Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F102762701
run.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Feb 23, 22:52
Size
2 KB
Mime Type
text/x-python
Expires
Tue, Feb 25, 22:52 (2 d)
Engine
blob
Format
Raw Data
Handle
24418111
Attached To
rTZUCT ML_Project1
run.py
View Options
# Martin Fontanet, Dennis Gankin, Vikalp Kamdar
#
# run.py — end-to-end training/submission script for ML_Project1.
# Relies on the project-local modules proj1_helpers and functions.
import itertools  # NOTE(review): not referenced in this script; kept because removing a file-level import is unsafe without seeing all callers.

import numpy as np

from proj1_helpers import *
from functions import *

# FIX: the original additionally ran `exec(open("functions.py").read())`.
# That re-executed functions.py a second time in this module's globals even
# though the star-import above already binds every public name it defines.
# The exec was redundant, and exec-ing file contents is an unsafe pattern,
# so it has been removed.
# Load the raw training and test sets: labels, noisy feature matrices, ids.
y_tr, x_tr_noisy, ids_tr = load_csv_data("train.csv")
y_te, x_te_noisy, ids_te = load_csv_data("test.csv")

# Reshape the training labels into a column vector and remap them from the
# {-1, +1} alphabet to {0, 1} (presumably what the training helpers expect —
# TODO confirm against functions.py).
y_tr = (np.expand_dims(y_tr, axis=1) + 1) / 2
# Degree of the polynomial basis used to expand the training features.
degree = 3

print("Treating the sample values...")
# Derive cleaned feature values from the noisy inputs — the original comment
# describes these as "computed by machine learning"; see compute_new_x.
new_x_tr, new_x_te = compute_new_x(x_tr_noisy, x_te_noisy)

# Standardize the derived features, then expand the training set in a
# polynomial basis to build the model matrix tx_tr.
x_tr, x_te = standardize_data(new_x_tr, new_x_te)
tx_tr = build_poly_matrix(x_tr, degree)
print("Done")
# Mini-batch size for each Newton step, and how many reshuffled passes to run.
subset_size = 800
nb_seeds = 20

# Initial weight vector: one zero per column of the polynomial model matrix.
w = np.zeros((tx_tr.shape[1], 1))

# (score, w) pairs from every iteration (sorted later), and the best pair
# produced by each shuffled pass.
sorted_ws = []
best_of_subsets = []

# Total number of mini-batch iterations across all passes — progress display
# only.  Floor division is equivalent to the original int(a / b) here since
# both operands are positive.
total_iterations = nb_seeds * len(y_tr) // subset_size
print("Computation of the weights")
# Run the whole computation several times, each pass with a different
# shuffling of the samples (the pass index doubles as the RNG seed).
# NOTE(review): every batch below is warm-started from `w`, which is only
# updated at the END of each pass (to that pass's best averaged weights).
for seed in range(nb_seeds):
    # Deterministically reshuffle the training samples for this pass.
    np.random.seed(seed)
    shuffled_indices = np.arange(len(y_tr))
    np.random.shuffle(shuffled_indices)
    y_to_use = y_tr[shuffled_indices]
    tx_to_use = tx_tr[shuffled_indices]
    # Progress display: global iteration counter out of total_iterations.
    print(int(seed * len(y_to_use) / subset_size) + 1, "/", total_iterations)
    # First subset handled outside the inner loop so w_list can be created
    # with its first (score, w) entry.
    w_i = batch_penalized_newton(y_to_use, tx_to_use, w, 0, subset_size)
    # Scores are always computed on the FULL (unshuffled) training set.
    score = compute_score(y_tr, tx_tr, w_i.T)
    sorted_ws.append((score, w_i))
    w_list = [(score, w_i)]
    # Remaining subsets: one penalized-Newton step per subset_size slice.
    for i in range(subset_size, len(y_to_use), subset_size):
        print(int(int(i / subset_size) + 1 + seed * len(y_to_use) / subset_size), "/", total_iterations)
        w_i = batch_penalized_newton(y_to_use, tx_to_use, w, i, i + subset_size)
        score = compute_score(y_tr, tx_tr, w_i.T)
        sorted_ws.append((score, w_i))
        w_list.append((score, w_i))
    # Rank this pass's candidates by score, best first (scores are treated
    # as higher-is-better throughout this script).
    w_list.sort(key=lambda tup: tup[0], reverse=True)
    # Find the best mean of candidate weights; `w_i` and `score` here are the
    # LAST inner-loop values, used as the fallback — presumably best_mean
    # returns them unchanged if no mean beats them (TODO confirm).
    score, w_of_subset = best_mean(w_list, w_i, score)
    best_of_subsets.append((score, w_of_subset))
    # Warm-start the next pass from this pass's best averaged weights.
    w = w_of_subset
# The last pass's averaged weights are the starting candidate; score them on
# the full training set.
best_w = w
max_score = compute_score(y_tr, tx_tr, best_w.T)

# Pool every candidate seen so far (all per-iteration weights plus each
# pass's winner) and rank them by score in decreasing order.
sorted_ws.extend(best_of_subsets)
sorted_ws.sort(key=lambda pair: pair[0], reverse=True)

print("Optimizing the weights...")
# Search for the best-scoring mean of candidates, falling back on best_w.
max_score, w = best_mean(sorted_ws, best_w, max_score)
print("Done")
print("Score :", max_score)

# Write the submission file from the final weight vector.
create_submission(new_x_tr, new_x_te, w.T)
Event Timeline
Log In to Comment