run.py
import pandas as pd
import numpy as np
import sklearn as sk
#import seaborn as sns
#import matplotlib.pyplot as plt
from sklearn.kernel_approximation import Nystroem
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.cluster import DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
#local import
import compute_stats as cs
from clean_data import run_data_cleaning
def pre_processing(data, norm=False, std=False):
    """
    Apply a transformation to the data.
    Options are standardization or normalization (using the L2 norm).
    By default, no transformation is applied to the data.
    """
    result = data
    if norm:
        result = Normalizer().fit_transform(data)
    if std:
        # note: if both flags are set, standardization overwrites the normalized result
        result = StandardScaler().fit_transform(data)
    return result
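# Minimal usage sketch for pre_processing, kept as a comment so the script's
# behaviour is unchanged (the toy matrix is hypothetical, not project data):
#   X_demo = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
#   X_std = pre_processing(X_demo, std=True)  # each column gets zero mean and unit variance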
def prepare_data_pca(file_names, norm=False, std=False, power_trans=False, cov_rem=False, DBSCAN_rem=False, quant_rem=False):
    """
    Prepare the data for Ridge Regression. The report discusses each method used for the suppression of outliers.
    The report and the Jupyter notebook contain the heatmap of the data, which explains why some features are dropped and others are kept.
    We strongly encourage the reader to look at the notebook Ridge_Regression_analysis, specifically the part about "Data Exploratory Analysis".
    """
    # load one or several data files (for instance years 2016 and 2017)
    if isinstance(file_names, str):
        file_names = [file_names]
    dfs = []
    for file_name in file_names:
        local_df = pd.read_csv(file_name)
        dfs.append(local_df)
    df = pd.concat(dfs, sort=False)
# prepares the target
y = df['Produktion [kWh]**'].values
emplacement = df['Anlage_Ort # Emplacement de installation'].values
# drop the target from the data
df = df.drop(columns=["Produktion [kWh]**", 'Anlage_Ort # Emplacement de installation'])
# remove features selected by hand & using cross validation
df = df.drop(columns=['Total Anlage','natürliche Personen', 'Population: Habitants', 'Répartition par âge en %: 0-19 ans','Mouvement de la population (en ‰): Taux brut de nuptialité',
'Mouvement de la population (en ‰): Taux brut de divortialité','Mouvement de la population (en ‰): Taux brut de natalité', 'Economie: Secteur primaire',
'Economie: Secteur secondaire', 'Economie: Secteur tertiaire','Economie: Secteur primaire.1','Economie: Secteur secondaire.1', 'Economie: Secteur tertiaire.1',
'Répartition par âge en %: 65 ans ou plus'])
df = df.drop(columns=["Constructions et logements: Taux de logements vacants",'Ménage: Ménages privés', 'Economie: Emplois total', 'Surface: Variation en ha.1', 'C5', 'C2'])
X = df.values
    # methods to remove outliers
    if cov_rem:
        # EllipticEnvelope fits a robust Gaussian model and marks outliers with label -1
        cov = EllipticEnvelope(random_state=0).fit(X)
        outliers_cov = cov.predict(X)
        X = X[outliers_cov != -1]
        y = y[outliers_cov != -1]
        emplacement = emplacement[outliers_cov != -1]
    if DBSCAN_rem:
        # DBSCAN assigns the label -1 to points that belong to no cluster (noise); drop them
        outlier_detection = DBSCAN(eps=1000, metric="euclidean", min_samples=3)
        clusters = outlier_detection.fit_predict(X)
        X = X[clusters != -1]
        y = y[clusters != -1]
        emplacement = emplacement[clusters != -1]
        #print(clusters[clusters != -1].shape)
    if quant_rem:
        # map each feature to a uniform distribution, which squashes extreme values
        X = quantile_transform(X, n_quantiles=10, random_state=0, copy=True)
    # apply a power transform (Yeo-Johnson by default) to make the data more Gaussian
    if power_trans:
        pt = PowerTransformer()
        X = pt.fit_transform(X)
    # standardize or normalize the data if requested
    X = pre_processing(X, norm, std)
return X, y, df, emplacement
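# Minimal usage sketch for prepare_data_pca, kept as a comment; the path and
# flags mirror the best-run configuration used below:
#   X, y, df, emplacement = prepare_data_pca("./merged_2016.csv", std=True, DBSCAN_rem=True)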
def kernel_test_nystroem(gammas, n_components, filename):
    """
    Function that produces the best run of the project. We use DBSCAN to remove the outliers,
    train on years 2016 and 2017, and test on year 2018.
    Warning: to get this result we applied a strategy that is developed in the Jupyter notebook.
    """
    rmse_saved = []
    # the prepared data does not depend on (gamma, n_components), so load it once outside the loops
    x_train_raw, y_train, _, _ = prepare_data_pca(("./merged_2016.csv", "./merged_2017.csv"), norm=False, std=True, power_trans=False, cov_rem=False, DBSCAN_rem=True, quant_rem=False)
    x_test_raw, y_test, _, emplacement = prepare_data_pca("./merged_2018.csv", norm=False, std=True, power_trans=False, cov_rem=False, DBSCAN_rem=True, quant_rem=False)
    for g in gammas:
        for n in n_components:
            # full-rank PCA: keeps every component, so it only centers and decorrelates the features
            pca = PCA(n_components=x_train_raw.shape[1])
            x_train = pca.fit_transform(x_train_raw, y_train)
            x_test = pca.transform(x_test_raw)
            # Nystroem computes a low-rank approximation of the (RBF) kernel feature map
            feature_map_nystroem = Nystroem(gamma=g, random_state=1, n_components=n)
            x_train = feature_map_nystroem.fit_transform(x_train)
            x_test = feature_map_nystroem.transform(x_test)
            #x_train, x_test, y_train, y_test = train_test_split(data_transformed, y, test_size=0.3, random_state=3)
            # RidgeCV picks the best alpha by efficient leave-one-out cross-validation on the training set
            model = RidgeCV(alphas=[0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4, 5]).fit(x_train, y_train)
            final_score = model.score(x_test, y_test)
            y_prediction = model.predict(x_test)
            diff = np.abs(y_prediction - y_test)
            # take the square root so the stored value is an actual RMSE, not an MSE
            rmse = np.sqrt(mean_squared_error(y_test, y_prediction))
            cs.compute_statistics_model(y_prediction, y_test, print_graph=False)
            rmse_saved.append(rmse)
            pred_1 = pd.DataFrame({"Anlage_Ort # Emplacement de installation": emplacement, "Prediction": y_prediction})
            pred_1.to_csv(filename, index=False)
    return rmse_saved, diff
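# Minimal usage sketch for kernel_test_nystroem, kept as a comment; the gamma
# and n_components grids here are illustrative, not the tuned values:
#   rmses, diff = kernel_test_nystroem([0.01, 0.02], [500, 1000], "pred_demo.csv")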
if __name__ == '__main__':
print("Begin data cleaning ...")
run_data_cleaning()
print("Data cleaning done!\nBegin training and fitting ...")
# Best result configuration
filename = "pred_2018_ridge_regression_best.csv"
kernel_test_nystroem([0.02], [2401], filename)
print("Training done! Predictions done!\nPredictions saved under {}".format(filename))
