Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F102762701
run.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Feb 23, 22:52
Size
2 KB
Mime Type
text/x-python
Expires
Tue, Feb 25, 22:52 (2 d)
Engine
blob
Format
Raw Data
Handle
24418111
Attached To
rTZUCT ML_Project1
run.py
View Options
# Martin Fontanet, Dennis Gankin, Vikalp Kamdar
#
# run.py — end-to-end training/submission script for ML_Project1.
# Relies on the project-local modules proj1_helpers and functions.
import itertools  # NOTE(review): not referenced in this script; kept because removing a file-level import is unsafe without seeing all callers.

import numpy as np

from proj1_helpers import *
from functions import *

# FIX: the original additionally ran `exec(open("functions.py").read())`.
# That re-executed functions.py a second time in this module's globals even
# though the star-import above already binds every public name it defines.
# The exec was redundant, and exec-ing file contents is an unsafe pattern,
# so it has been removed.
# Load the raw training and test sets: labels, noisy feature matrices, ids.
y_tr, x_tr_noisy, ids_tr = load_csv_data("train.csv")
y_te, x_te_noisy, ids_te = load_csv_data("test.csv")

# Reshape the training labels into a column vector and remap them from the
# {-1, +1} alphabet to {0, 1} (presumably what the training helpers expect —
# TODO confirm against functions.py).
y_tr = (np.expand_dims(y_tr, axis=1) + 1) / 2
# Degree of the polynomial basis used to expand the training features.
degree = 3

print("Treating the sample values...")
# Derive cleaned feature values from the noisy inputs — the original comment
# describes these as "computed by machine learning"; see compute_new_x.
new_x_tr, new_x_te = compute_new_x(x_tr_noisy, x_te_noisy)

# Standardize the derived features, then expand the training set in a
# polynomial basis to build the model matrix tx_tr.
x_tr, x_te = standardize_data(new_x_tr, new_x_te)
tx_tr = build_poly_matrix(x_tr, degree)
print("Done")
# Mini-batch size for each Newton step, and how many reshuffled passes to run.
subset_size = 800
nb_seeds = 20

# Initial weight vector: one zero per column of the polynomial model matrix.
w = np.zeros((tx_tr.shape[1], 1))

# (score, w) pairs from every iteration (sorted later), and the best pair
# produced by each shuffled pass.
sorted_ws = []
best_of_subsets = []

# Total number of mini-batch iterations across all passes — progress display
# only.  Floor division is equivalent to the original int(a / b) here since
# both operands are positive.
total_iterations = nb_seeds * len(y_tr) // subset_size
print("Computation of the weights")
# Run the whole computation several times, each pass with a different
# shuffling of the samples (the pass index doubles as the RNG seed).
# NOTE(review): every batch below is warm-started from `w`, which is only
# updated at the END of each pass (to that pass's best averaged weights).
for seed in range(nb_seeds):
    # Deterministically reshuffle the training samples for this pass.
    np.random.seed(seed)
    shuffled_indices = np.arange(len(y_tr))
    np.random.shuffle(shuffled_indices)
    y_to_use = y_tr[shuffled_indices]
    tx_to_use = tx_tr[shuffled_indices]
    # Progress display: global iteration counter out of total_iterations.
    print(int(seed * len(y_to_use) / subset_size) + 1, "/", total_iterations)
    # First subset handled outside the inner loop so w_list can be created
    # with its first (score, w) entry.
    w_i = batch_penalized_newton(y_to_use, tx_to_use, w, 0, subset_size)
    # Scores are always computed on the FULL (unshuffled) training set.
    score = compute_score(y_tr, tx_tr, w_i.T)
    sorted_ws.append((score, w_i))
    w_list = [(score, w_i)]
    # Remaining subsets: one penalized-Newton step per subset_size slice.
    for i in range(subset_size, len(y_to_use), subset_size):
        print(int(int(i / subset_size) + 1 + seed * len(y_to_use) / subset_size), "/", total_iterations)
        w_i = batch_penalized_newton(y_to_use, tx_to_use, w, i, i + subset_size)
        score = compute_score(y_tr, tx_tr, w_i.T)
        sorted_ws.append((score, w_i))
        w_list.append((score, w_i))
    # Rank this pass's candidates by score, best first (scores are treated
    # as higher-is-better throughout this script).
    w_list.sort(key=lambda tup: tup[0], reverse=True)
    # Find the best mean of candidate weights; `w_i` and `score` here are the
    # LAST inner-loop values, used as the fallback — presumably best_mean
    # returns them unchanged if no mean beats them (TODO confirm).
    score, w_of_subset = best_mean(w_list, w_i, score)
    best_of_subsets.append((score, w_of_subset))
    # Warm-start the next pass from this pass's best averaged weights.
    w = w_of_subset
# The last pass's averaged weights are the starting candidate; score them on
# the full training set.
best_w = w
max_score = compute_score(y_tr, tx_tr, best_w.T)

# Pool every candidate seen so far (all per-iteration weights plus each
# pass's winner) and rank them by score in decreasing order.
sorted_ws.extend(best_of_subsets)
sorted_ws.sort(key=lambda pair: pair[0], reverse=True)

print("Optimizing the weights...")
# Search for the best-scoring mean of candidates, falling back on best_w.
max_score, w = best_mean(sorted_ws, best_w, max_score)
print("Done")
print("Score :", max_score)

# Write the submission file from the final weight vector.
create_submission(new_x_tr, new_x_te, w.T)
Event Timeline
Log In to Comment