# run.py
# Martin Fontanet, Dennis Gankin, Vikalp Kamdar
import numpy as np
from proj1_helpers import *
from functions import *
import itertools
# Loading the data
y_tr, x_tr_noisy, ids_tr = load_csv_data("train.csv")
y_te, x_te_noisy, ids_te = load_csv_data("test.csv")
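# For reference, a minimal sketch of what load_csv_data (imported from
# proj1_helpers above) presumably does, assuming the standard Higgs CSV layout
# with an Id column, a Prediction column of 's'/'b' labels, and the feature
# columns after that. The name _load_csv_data_sketch and the column layout are
# assumptions; the imported version is the one actually used.
def _load_csv_data_sketch(data_path):
    labels = np.genfromtxt(data_path, delimiter=",", skip_header=1, usecols=1, dtype=str)
    data = np.genfromtxt(data_path, delimiter=",", skip_header=1)
    ids = data[:, 0].astype(int)
    x = data[:, 2:]
    y = np.where(labels == "b", -1, 1)  # 's' (signal) -> 1, 'b' (background) -> -1
    return y, x, ids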
# We remap the prediction alphabet from {-1,1} to {0,1} and give y the correct (column) shape
y_tr = np.expand_dims(y_tr, axis=1)
y_tr = (y_tr + 1) / 2
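# For example, assuming the raw labels are in {-1, 1}:
#   y = -1  ->  (y + 1) / 2 = 0
#   y =  1  ->  (y + 1) / 2 = 1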
# Computing the new values for x_tr and x_te
print("Treating the sample values...")
new_x_tr, new_x_te = compute_new_x(x_tr_noisy, x_te_noisy)
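# The real compute_new_x lives in functions.py; as a purely illustrative
# placeholder (an assumption, not the author's method), here is a minimal
# sketch that replaces the -999.0 placeholders used for missing values in the
# Higgs dataset by the per-feature median of the observed training values.
def _compute_new_x_sketch(x_tr_noisy, x_te_noisy):
    x_tr_clean, x_te_clean = x_tr_noisy.copy(), x_te_noisy.copy()
    for j in range(x_tr_noisy.shape[1]):
        observed = x_tr_noisy[:, j] != -999.0
        if observed.any():
            median = np.median(x_tr_noisy[observed, j])
            x_tr_clean[~observed, j] = median
            x_te_clean[x_te_noisy[:, j] == -999.0, j] = median
    return x_tr_clean, x_te_clean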
# We standardize the new version of x (computed by machine learning) and transform it using the polynomial basis
degree = 3
x_tr, x_te = standardize_data(new_x_tr, new_x_te)
tx_tr = build_poly_matrix(x_tr, degree)
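# For reference, hedged sketches of the two helpers used above, assuming the
# usual conventions: standardization uses the training mean/std for both sets,
# and the polynomial basis stacks a bias column with the powers x, x^2, ...,
# x^degree of every feature. The _sketch names are assumptions; the imported
# versions from functions.py are the ones actually used.
def _standardize_data_sketch(x_tr, x_te):
    mean, std = x_tr.mean(axis=0), x_tr.std(axis=0)
    return (x_tr - mean) / std, (x_te - mean) / std

def _build_poly_matrix_sketch(x, degree):
    # Columns: [1, x, x^2, ..., x^degree] for each original feature
    cols = [np.ones((x.shape[0], 1))]
    for d in range(1, degree + 1):
        cols.append(x ** d)
    return np.concatenate(cols, axis=1)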
print("Done")
# Initial weight vector
w = np.zeros((tx_tr.shape[1], 1))
# Number of samples used at each step
subset_size = 800
# List of (score, w) pairs from every iteration (sorted later)
sorted_ws = []
# Best (score, w) combination found for each shuffled pass
best_of_subsets = []
# We repeat the computation several times with a different shuffling (= different seeds)
nb_seeds = 20
total_iterations = int(nb_seeds * len(y_tr) / subset_size)
print("Computation of the weights")
for seed in range(nb_seeds):
    # We shuffle the data sample set
    np.random.seed(seed)
    shuffled_indices = np.arange(len(y_tr))
    np.random.shuffle(shuffled_indices)
    y_to_use = y_tr[shuffled_indices]
    tx_to_use = tx_tr[shuffled_indices]
    print(int(seed * len(y_to_use) / subset_size) + 1, "/", total_iterations)
    # We compute w on the first subset outside the inner loop in order to initialize the list of w's
    w_i = batch_penalized_newton(y_to_use, tx_to_use, w, 0, subset_size)
    score = compute_score(y_tr, tx_tr, w_i.T)
    sorted_ws.append((score, w_i))
    w_list = [(score, w_i)]
    for i in range(subset_size, len(y_to_use), subset_size):
        print(int(i / subset_size) + 1 + int(seed * len(y_to_use) / subset_size), "/", total_iterations)
        # We compute w with a penalized Newton step on the next subset and compute its score
        w_i = batch_penalized_newton(y_to_use, tx_to_use, w, i, i + subset_size)
        score = compute_score(y_tr, tx_tr, w_i.T)
        sorted_ws.append((score, w_i))
        w_list.append((score, w_i))
    w_list.sort(key=lambda tup: tup[0], reverse=True)
    # We look for the best combination of w's (the best mean)
    score, w_of_subset = best_mean(w_list, w_i, score)
    best_of_subsets.append((score, w_of_subset))
    w = w_of_subset
    score = compute_score(y_tr, tx_tr, w.T)
    max_score = score
    best_w = w
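# For reference, a hedged sketch of what best_mean might do, given how it is
# called: starting from the best-scoring w, it averages in the sorted
# candidates one by one and keeps the averaged w with the highest training
# score. This reconstruction (including the use of the global y_tr/tx_tr) is
# an assumption; the actual logic is in functions.py.
def _best_mean_sketch(sorted_w_list, best_w, best_score):
    ws = [wi for (_, wi) in sorted_w_list]
    for k in range(2, len(ws) + 1):
        w_mean = np.mean(ws[:k], axis=0)          # mean of the k best candidates
        s = compute_score(y_tr, tx_tr, w_mean.T)  # scored on the full training set
        if s > best_score:
            best_score, best_w = s, w_mean
    return best_score, best_w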
# We sort the full list of (score, w) pairs by score (in decreasing order)
sorted_ws.extend(best_of_subsets)
sorted_ws.sort(key=lambda tup: tup[0], reverse=True)
print("Optimizing the weights...")
# We look for the best combination of w's (the best mean)
max_score, w = best_mean(sorted_ws, best_w, max_score)
print("Done")
print("Score :", max_score)
create_submission(new_x_tr, new_x_te, w.T)
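# For reference, a hedged sketch of what create_submission might do, given its
# arguments: re-standardize the test features with the training statistics,
# expand them in the same polynomial basis, threshold the predictions, map
# them back to {-1, 1}, and write the CSV via proj1_helpers'
# create_csv_submission. The file name "submission.csv" is an assumption.
def _create_submission_sketch(new_x_tr, new_x_te, w):
    x_tr_std, x_te_std = standardize_data(new_x_tr, new_x_te)
    tx_te = build_poly_matrix(x_te_std, degree)
    # sigmoid(t) >= 0.5 exactly when t >= 0, so we threshold the linear score
    y_pred = np.where(tx_te @ w.reshape(-1, 1) >= 0, 1, -1).flatten()
    create_csv_submission(ids_te, y_pred, "submission.csv")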
