run.py
import pandas as pd
import numpy as np
import sklearn as sk
#import seaborn as sns
#import matplotlib.pyplot as plt
from sklearn.kernel_approximation import Nystroem
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.cluster import DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
#local import
import compute_stats as cs
from clean_data import run_data_cleaning
def pre_processing(data, norm=False, std=False):
    """
    Apply a transformation to the data.
    Options are standardization or normalization (using the L2 norm).
    By default, no transformation is applied to the data.
    """
    result = data
    if norm:
        result = Normalizer().fit_transform(data)
    if std:
        # note: if both flags are set, standardization overwrites the normalized result
        result = StandardScaler().fit_transform(data)
    return result
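# Minimal usage sketch for pre_processing, kept as a comment so the script's
# behaviour is unchanged (the toy matrix is hypothetical, not project data):
#   X_demo = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
#   X_std = pre_processing(X_demo, std=True)  # each column gets zero mean and unit variance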
def prepare_data_pca(file_names, norm=False, std=False, power_trans=False, cov_rem=False, DBSCAN_rem=False, quant_rem=False):
    """
    Prepare the data for Ridge Regression. The report discusses each method used for the suppression of outliers.
    The report and the Jupyter notebook contain the heatmap of the data, which explains why some features are dropped and others are kept.
    We strongly encourage the reader to look at the notebook Ridge_Regression_analysis, specifically the part about "Data Exploratory Analysis".
    """
    # load one or several data files (for instance years 2016 and 2017)
    if isinstance(file_names, str):
        file_names = [file_names]
    dfs = []
    for file_name in file_names:
        local_df = pd.read_csv(file_name)
        dfs.append(local_df)
    df = pd.concat(dfs, sort=False)
# prepares the target
y = df['Produktion [kWh]**'].values
emplacement = df['Anlage_Ort # Emplacement de installation'].values
# drop the target from the data
df = df.drop(columns=["Produktion [kWh]**", 'Anlage_Ort # Emplacement de installation'])
# remove features selected by hand & using cross validation
df = df.drop(columns=['Total Anlage','natürliche Personen', 'Population: Habitants', 'Répartition par âge en %: 0-19 ans','Mouvement de la population (en ‰): Taux brut de nuptialité',
'Mouvement de la population (en ‰): Taux brut de divortialité','Mouvement de la population (en ‰): Taux brut de natalité', 'Economie: Secteur primaire',
'Economie: Secteur secondaire', 'Economie: Secteur tertiaire','Economie: Secteur primaire.1','Economie: Secteur secondaire.1', 'Economie: Secteur tertiaire.1',
'Répartition par âge en %: 65 ans ou plus'])
df = df.drop(columns=["Constructions et logements: Taux de logements vacants",'Ménage: Ménages privés', 'Economie: Emplois total', 'Surface: Variation en ha.1', 'C5', 'C2'])
X = df.values
    # methods to remove outliers
    if cov_rem:
        # EllipticEnvelope fits a robust Gaussian model and marks outliers with label -1
        cov = EllipticEnvelope(random_state=0).fit(X)
        outliers_cov = cov.predict(X)
        X = X[outliers_cov != -1]
        y = y[outliers_cov != -1]
        emplacement = emplacement[outliers_cov != -1]
    if DBSCAN_rem:
        # DBSCAN assigns the label -1 to points that belong to no cluster (noise); drop them
        outlier_detection = DBSCAN(eps=1000, metric="euclidean", min_samples=3)
        clusters = outlier_detection.fit_predict(X)
        X = X[clusters != -1]
        y = y[clusters != -1]
        emplacement = emplacement[clusters != -1]
        #print(clusters[clusters != -1].shape)
    if quant_rem:
        # map each feature to a uniform distribution, which squashes extreme values
        X = quantile_transform(X, n_quantiles=10, random_state=0, copy=True)
    # apply a power transform (Yeo-Johnson by default) to make the data more Gaussian
    if power_trans:
        pt = PowerTransformer()
        X = pt.fit_transform(X)
    # standardize or normalize the data if requested
    X = pre_processing(X, norm, std)
return X, y, df, emplacement
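# Minimal usage sketch for prepare_data_pca, kept as a comment; the path and
# flags mirror the best-run configuration used below:
#   X, y, df, emplacement = prepare_data_pca("./merged_2016.csv", std=True, DBSCAN_rem=True)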
def kernel_test_nystroem(gammas, n_components, filename):
    """
    Function that produces the best run of the project. We use DBSCAN to remove the outliers,
    train on years 2016 and 2017, and test on year 2018.
    Warning: to get this result we applied a strategy that is developed in the Jupyter notebook.
    """
    rmse_saved = []
    # the prepared data does not depend on (gamma, n_components), so load it once outside the loops
    x_train_raw, y_train, _, _ = prepare_data_pca(("./merged_2016.csv", "./merged_2017.csv"), norm=False, std=True, power_trans=False, cov_rem=False, DBSCAN_rem=True, quant_rem=False)
    x_test_raw, y_test, _, emplacement = prepare_data_pca("./merged_2018.csv", norm=False, std=True, power_trans=False, cov_rem=False, DBSCAN_rem=True, quant_rem=False)
    for g in gammas:
        for n in n_components:
            # full-rank PCA: keeps every component, so it only centers and decorrelates the features
            pca = PCA(n_components=x_train_raw.shape[1])
            x_train = pca.fit_transform(x_train_raw, y_train)
            x_test = pca.transform(x_test_raw)
            # Nystroem computes a low-rank approximation of the (RBF) kernel feature map
            feature_map_nystroem = Nystroem(gamma=g, random_state=1, n_components=n)
            x_train = feature_map_nystroem.fit_transform(x_train)
            x_test = feature_map_nystroem.transform(x_test)
            #x_train, x_test, y_train, y_test = train_test_split(data_transformed, y, test_size=0.3, random_state=3)
            # RidgeCV picks the best alpha by efficient leave-one-out cross-validation on the training set
            model = RidgeCV(alphas=[0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4, 5]).fit(x_train, y_train)
            final_score = model.score(x_test, y_test)
            y_prediction = model.predict(x_test)
            diff = np.abs(y_prediction - y_test)
            # take the square root so the stored value is an actual RMSE, not an MSE
            rmse = np.sqrt(mean_squared_error(y_test, y_prediction))
            cs.compute_statistics_model(y_prediction, y_test, print_graph=False)
            rmse_saved.append(rmse)
            pred_1 = pd.DataFrame({"Anlage_Ort # Emplacement de installation": emplacement, "Prediction": y_prediction})
            pred_1.to_csv(filename, index=False)
    return rmse_saved, diff
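# Minimal usage sketch for kernel_test_nystroem, kept as a comment; the gamma
# and n_components grids here are illustrative, not the tuned values:
#   rmses, diff = kernel_test_nystroem([0.01, 0.02], [500, 1000], "pred_demo.csv")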
if __name__ == '__main__':
print("Begin data cleaning ...")
run_data_cleaning()
print("Data cleaning done!\nBegin training and fitting ...")
# Best result configuration
filename = "pred_2018_ridge_regression_best.csv"
kernel_test_nystroem([0.02], [2401], filename)
print("Training done! Predictions done!\nPredictions saved under {}".format(filename))
