helpers_linear_regression.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Tue, Jul 8, 12:48

helpers_linear_regression.py
View Options

	import numpy as np
	import matplotlib.pyplot as plt
	import pandas as pd
	from datetime import timedelta
	from copy import deepcopy
	from sklearn.metrics import mean_squared_error

	import warnings
	warnings.filterwarnings('ignore')

	from helpers_clustering import create_weekly_matrix


	def regularize_data(energy_consumptions, atypical_weeks, model_week, data_per_week = 672, data_per_day = 96, total_weeks = 98):
	"""
	Creation of a new dataset "regularized_energy_consumptions" where the atypical days that charcaterize the atypical
	weeks found using the clustering, are replace with the corresponding days of the model week.
	Moreover the dataset "df_atypical_weeks", containing the number of atypical
	days for each week, is assembled

	:param DataFrame energy_consumptions: formatted DataFrame containing measurements of energy consumption.
	Index is of type DatetimeIndex,
	column 'consumption' contains the measurement,
	column 'week' contains the week code of the measurement.
	:param vector atypical_weeks: indexes of the weeks that are classified as atypical by the clustering phase
	:param vector model_week: model week computed using the median of the normal weeks
	:param int data_per_week: number of measurement for every week (default value is 672, a measurement every 15 minutes)
	:param int data_per_day: number of measurement for every week (default value is 96, a measurement every 15 minutes)
	:param int total_weeks: number of total weeks in the dataset (default value is 98)
	:output DataFrame regularized_energy_consumptions: formatted DataFrame containing measurements of energy consumption.
	Index is of type DatetimeIndex,
	column 'consumption' contains the measurement,
	column 'week' contains the week code of the measurement,
	column 'atypical day' contains a bool indicating if that measurament
	was replaced with the model one.
	:output DataFrame df_atypical_weeks: DataFrame containing information about
	the number of detected atypical days.
	Index is of type int and contains the week number,
	column 'Atypical days number' contains the number of atypical
	days found in that week.

	"""

	regularized_energy_consumptions = deepcopy(energy_consumptions)
	regularized_energy_consumptions['atypical day'] = False

	# day_model is a list of 7 arrays containing the model for every day of the week
	day_model = []
	for i in range(0, 7):
	day_model.append(model_week[idata_per_day:(i+1)data_per_day])

	# X = (num weeks, num data per week)
	X = create_weekly_matrix(energy_consumptions)
	X_atypical = X[atypical_weeks,:]


	# Create a dataframe containing the atypical weeks and indicating how many atypical days there are in each one
	df_atypical_weeks = pd.DataFrame({'Atypical days number' :np.zeros(total_weeks)})
	df_atypical_weeks.index.names = ['Week number']

	# Computing the relative error of every measurement with respect to the model week
	error_anomalous = np.abs(X_atypical - model_week)
	relative_error = error_anomalous/model_week
	num_atypical_weeks = np.shape(X_atypical)[0]
	columns_day = np.arange(data_per_day)

	# Counting the number of anomalies in each day of the atypcial weeks:
	# a measurement is anomalous if the relative error is greater than 30%
	anomalies_perday = np.zeros((num_atypical_weeks, 7))
	for week_index in range(num_atypical_weeks):
	for day_index in range(7):
	anomalies_perday[week_index, day_index] = (relative_error[week_index, day_index*data_per_day+columns_day] > 0.3).sum()

	# Indacting for each day of the week, in which week nuber it has been
	# detected as anomalous.
	# A day is anomalous if the number of anomalies in that day is
	# greater than 30, meaning that more than 31% of the measurement in that day
	# are anomalous

	days = []
	for day_index in range(7):
	current_day_anoumalous_weeks = []
	for week_index in range(num_atypical_weeks):

	if anomalies_perday[week_index, day_index] > 30:
	current_day_anoumalous_weeks.append(atypical_weeks[week_index])
	days.append(current_day_anoumalous_weeks)

	# For each week, count how many atypical days are detected
	for week_index in atypical_weeks:
	count = 0
	for i in range(7):

	# Weeks that has atypical days on days i
	weeks_i = np.asarray(days[i])
	count += np.count_nonzero(weeks_i == week_index)
	df_atypical_weeks.iloc[week_index, 0] = count

	# In the dataset regularized_energy_consumptions, replace the detected
	# atypical days measurements with the corresponding values of the model week.
	rows = np.arange(data_per_day)
	for day_index, day in enumerate(days):
	for global_week in day:
	regularized_energy_consumptions.iloc[rows + data_per_dayday_index + global_weekdata_per_week, 0] = day_model[day_index]
	regularized_energy_consumptions.iloc[rows + data_per_dayday_index + global_weekdata_per_week, 2] = True

	return regularized_energy_consumptions, df_atypical_weeks

	def lm_dataframe_all_features(regularized_energy_consumptions, lags, data_per_week = 672, data_per_day = 96, total_weeks = 98):
	"""
	Creating a copy of the initial dataframe that will be used to construct the linear regression model.
	Lags are used as features, as well as some categorical ones. This function
	consider all the posssible features, without prevoius feature selection.

	:param DataFrame regularized_energy_consumptions: formatted DataFrame containing measurements of energy consumption.
	Index is of type DatetimeIndex,
	column 'consumption' contains the measurement,
	column 'week' contains the week code of the measurement.
	column 'atypical day' contains a bool indicating if that measurament
	was replaced with the model one.
	:param vector lags: indicates which previous measurament are used as features for the regression model
	:param int data_per_week: number of measurement for every week (default value is 672, a measurement every 15 minutes)
	:param int data_per_day: number of measurement for every week (default value is 96, a measurement every 15 minutes)
	:param int total_weeks: number of total weeks in the dataset (default value is 98)
	:output DataFrame data_lm: DataFrame containing target and features that will be
	used for linear regression.
	Index is of type DatetimeIndex,
	column 'y' contains the targets, i.e. the value that
	the time series ssume in each point
	the remaining column contains different typed of features.
	"""
	# Creating a copy of the initial dataframe and use it to construct the linear regression model
	data_lm = pd.DataFrame(regularized_energy_consumptions.consumption.copy())
	data_lm.columns = ["y"]


	# Add previous lags indicated in "lags"
	data_lm = add_lags(data_lm, lags)

	# Categorical features:

	# in which day of the week
	data_lm = add_categorical_day(data_lm)
	# if the measurement is in the central part of a day
	data_lm = add_categorical_central_part(data_lm)
	# if the measurement is in the central part of a working day or in a sunday
	data_lm = add_categorical_workday(data_lm)
	# in which month of the year
	data_lm = add_categorical_month(data_lm)

	return data_lm


	def lm_dataframe_categorical_features(regularized_energy_consumptions, lags, data_per_week = 672, data_per_day = 96, total_weeks = 98):
	"""
	Creating a copy of the initial dataframe that will be used to construct the linear regression model.
	Lags are used as features, as well as some categorical ones. This function
	consider only the feature selected by the feature selection process.

	:param DataFrame regularized_energy_consumptions: formatted DataFrame containing measurements of energy consumption.
	Index is of type DatetimeIndex,
	column 'consumption' contains the measurement,
	column 'week' contains the week code of the measurement.
	column 'atypical day' contains a bool indicating if that measurament
	was replaced with the model one.
	:param vector lags: indicates which previous measurament are used as features for the regression model
	:param int data_per_week: number of measurement for every week (default value is 672, a measurement every 15 minutes)
	:param int data_per_day: number of measurement for every week (default value is 96, a measurement every 15 minutes)
	:param int total_weeks: number of total weeks in the dataset (default value is 98)
	:output DataFrame data_lm: DataFrame containing target and features that will be
	used for linear regression.
	Index is of type DatetimeIndex,
	column 'y' contains the targets, i.e. the value that
	the time series ssume in each point
	the remaining column contains different typed of features.
	"""
	# Creating a copy of the initial dataframe and use it to construct the linear regression model

	data_lm = pd.DataFrame(regularized_energy_consumptions.consumption.copy())
	data_lm.columns = ["y"]

	# Add previous lags indicated in "lags"
	data_lm = add_lags(data_lm, lags)

	# Categorical features:

	# if the measurement is in the central part of a working day or in a sunday
	data_lm = add_categorical_workday(data_lm)
	# if the measurement is in the month from may to august
	data_lm = add_categorical_months_summer(data_lm)

	return data_lm


	def steepness_anomaly_proof_regression(regression_model, regularized_energy_consumptions, energy_consumptions, data_lm, model_week, lags,total_weeks,data_per_week = 672):

	"""
	Construction of the regression model for the whole time series.
	Using the parameters optimized by the training phase of linear regression model, the prediction is made for every point, if this prediction is far from the real value, the point is classified as anomalous. When the prediction is done, if a previous point that has to be used as lag is anomalous, the predicted value is considered and not the real one.
	The way in which the anomaly check is computed takes care of in which part of the day the measurement is taken.
	:param model regression_model: model given by the linear regression training
	:param DataFrame energy_consumptions: formatted DataFrame containing measurements of energy consumption.
	Index is of type DatetimeIndex,
	column 'consumption' contains the measurement,
	column 'week' contains the week code of the measurement.
	:param DataFrame data_lm: DataFrame containing in the first column the real measurements and in the other columns the fetures used for regression
	:param vector model_week: model week computed using the median of the normal weeks
	:param vector lags: indicates which previous measurament are used as features for the regression model
	:param int total_weeks: number of weeks in the dataset
	:param int data_per_week: number of measurement for every week (default value is 672, a measurement every 15 minutes)
	:output DataFrame hardcoded: DataFrame containing the constructed model
	Index is of type DatetimeIndex,
	column 'consumption' contains the measurement,
	column 'model' contains the prediction made by the model
	column 'anomaly' contains a bool indicating if the point is considered as anomalous
	"""
	# new dataset used for forecasting and anomaly detection
	hardcoded_regression = pd.DataFrame(energy_consumptions.consumption)
	hardcoded_regression['model'] = np.hstack((np.tile(model_week, 4), np.zeros(data_per_week*(total_weeks - 4))))
	hardcoded_regression['anomaly'] = False
	hardcoded_regression['atypical day'] = regularized_energy_consumptions['atypical day']


	# "threshold" is used to detect anomalies in the central parts of the day, between 8a.m. and 7p.m.
	# Anomalies in the "head and tail" of each day are detected with a different criterion, based on relative error
	# that depends on "threshold_steep"
	# This double standard proved to be very convenient, since the "head and tail" section of each day is very steep.
	threshold = compute_anomaly_threshold_central_part_of_day(data_lm)
	threshold_steep = 0.8


	# First 4 weeks of the time series cannot be predicted by linear regression model since
	# the prevoius lags are not available: model week is replaced instead of the model
	real_values = hardcoded_regression['consumption'].iloc[0:data_per_week*4]
	model_values = hardcoded_regression['model']

	# anomaly detection for first 4 weeks
	for row_index in range(data_per_week*4):
	time_of_the_day = (row_index%672)%96
	hardcoded_regression = detect_anomaly(threshold, threshold_steep, time_of_the_day, row_index, hardcoded_regression,model_values[row_index], real_values[row_index])



	# Step by step prediction avoiding to use anomalous values:
	# starting from 5th week until the end of the dataset

	for row_index in range(4data_per_week, total_weeksdata_per_week):
	data_point = np.zeros(data_lm.columns.values.shape[0] - 1)
	features_mask_bool = hardcoded_regression.iloc[row_index-lags, 2].values
	features_mask_columns = np.zeros(features_mask_bool.shape[0])

	# Check if the lags used as features to predict the next point are anomalous,
	# if that is the case, the previous predicted model values are taken
	# as features for that measuraments
	for i in range(0, features_mask_bool.shape[0]):
	if features_mask_bool[i] == True:
	data_point[i] = hardcoded_regression.iloc[row_index - lags[i], 1]
	if features_mask_bool[i] == False:
	data_point[i] = hardcoded_regression.iloc[row_index - lags[i], 0]
	# Prediction using the trained linear model, saving it in hardcoded_regression dataset
	prediction = regression_model.predict(data_point.reshape(1, -1))
	hardcoded_regression.iloc[row_index, 1] = prediction


	# Detect if the predicted point is anomalous, evaluating it in different way
	# respect to the time of the day in which the measurement is taken
	time_of_the_day=(row_index%672)%96
	hardcoded_regression = detect_anomaly(threshold, threshold_steep, time_of_the_day, row_index, hardcoded_regression,prediction, hardcoded_regression.iloc[row_index,0])

	return hardcoded_regression


	def detect_anomaly(threshold, threshold_steep, time_of_the_day, row_index, hardcoded_regression, prediction, real):

	"""
	Detection of anomalies in real values of time series, using two different
	criterion for central part of the day and remaining part.

	:param double threshold : used for the central part
	:param double threshold_steep : used for begin and end of the day
	:param int time_of_day: indicate in which part of the day the measurement is taken
	:param int row_index: row index of the time series corresponding to the measurement to analyse
	:param,output DataFrame hardcoded_regression: DataFrame containing the constructed model
	Index is of type DatetimeIndex,
	column 'consumption' contains the measurement,
	column 'model' contains the prediction made by the model
	column 'anomaly' contains a bool indicating if the point is considered as anomalous
	column 'atypical day' contains a bool indicating if that measurament
	is part of a day detected as atypical in the first phase of the analysis.
	:param double prediction: value predicted by the linear model
	:param double real: actual value of the time series
	:output vector data_point: regularized features vector for the considered point
	"""
	absolute = np.abs(prediction - real)
	relative = absolute/prediction

	if (time_of_the_day > -1 and time_of_the_day < 32) or (time_of_the_day > 76):
	if relative > threshold_steep:
	hardcoded_regression.iloc[row_index, 2] = True
	elif absolute > threshold:
	hardcoded_regression.iloc[row_index, 2] = True

	return hardcoded_regression


	def compute_anomaly_threshold_central_part_of_day(data_lm,total_weeks=98,data_per_week = 672):
	"""
	This function computes autmoatically a threshold for each building. Such threshold is then used to detect if measurements
	in the central part of the day (opening hours basically) are anomalous or not. For each opening day, the std of the
	measurements during the opening hours is computed. This is done for each day between Monday and Saturday of the data.
	The computed std values are then averaged.
	For the "head and tail" of each day, a different threshold is used, set in the function steepness_anomaly_proof_regression.
	:param DataFrame data_lm: DataFrame containing target and features that will be
	used for linear regression.
	Index is of type DatetimeIndex,
	column 'y' contains the targets, i.e. the value that
	the time series ssume in each point
	the remaining column contains different typed of features.
	:param int total_weeks: number of weeks in the dataset (default value is 98)
	:param int data_per_week: number of measurement for every week (default value is 672, a measurement every 15 minutes)
	:param int data_per_day: number of measurement for every day (default value is 96, a measurement every 15 minutes)
	:output vector data_point: regularized features vector for the considered point
	"""

	std_vector = np.zeros(total_weeks*6)
	central_measurements_of_day = []

	for row_index in range(0, (total_weeks - 4)*data_per_week):
	if(row_index%672 == 0):
	#this matrix contains the measurements of the central part of the days from Monday to Saturday of the week currently #being analysed. It's initialized to zeros at the start of each new week.
	current_week_central_measurements_of_days = np.zeros((45,6))
	day_index = (row_index//96)%7
	if (day_index != 6):
	time_of_the_day = (row_index%672)%96
	#If the time is between 8 a.m. and 7 p.m., the measurements are kept in a vector. Their std is then computed.
	if (time_of_the_day >= 32 and time_of_the_day <= 76):
	current_week_central_measurements_of_days[time_of_the_day - 32,day_index]=data_lm.iloc[row_index,0]
	if (row_index%671 == 0):
	std_days_current_week = np.std(current_week_central_measurements_of_days, axis = 0)
	week_index = row_index//672
	std_vector[week_index6:(week_index+1)6] = std_days_current_week
	threshold = np.mean(std_vector)
	return 1.96*threshold



	def add_lags(data_lm, lags):
	"""
	Add the indicated lags as features of the target variable,
	contained in the y column of data_lm

	:param vector lags: indicates which previous measurament are used as features for the regression model
	:param,output DataFrame data_lm: DataFrame containing target and features that will be
	used for linear regression.
	Index is of type DatetimeIndex,
	column 'y' contains the targets, i.e. the value that
	the time series ssume in each point
	the remaining column contains different typed of features.
	"""



	for i in lags:
	data_lm["lag_{}".format(i)] = data_lm.y.shift(i)
	data_lm = data_lm.iloc[max(lags):]
	return data_lm


	def add_categorical_day(data_lm, data_per_week = 672, data_per_day = 96):
	"""
	Categorical features representing the day of the week are added

	:param int data_per_week: number of measurement for every week (default value is 672, a measurement every 15 minutes)
	:param int data_per_day: number of measurement for every day (default value is 96, a measurement every 15 minutes)
	:param,output DataFrame data_lm: DataFrame containing target and features that will be
	used for linear regression.
	Index is of type DatetimeIndex,
	column 'y' contains the targets, i.e. the value that
	the time series ssume in each point
	the remaining column contains different typed of features.
	"""



	categorical_day = np.zeros((data_lm.shape[0],7))
	for i in range(categorical_day.shape[0]):
	categorical_day[i,(i%data_per_week)//data_per_day] = 1
	for i in range(7):
	data_lm["day_{}".format(i+1)] = categorical_day[:,i]

	return data_lm

	def add_categorical_central_part(data_lm, data_per_week = 672, data_per_day = 96):
	"""
	Categorical features describing if the measurement is in the central part of
	the day are added

	:param int data_per_week: number of measurement for every week (default value is 672, a measurement every 15 minutes)
	:param int data_per_day: number of measurement for every day (default value is 96, a measurement every 15 minutes)
	:param,output DataFrame data_lm: DataFrame containing target and features that will be
	used for linear regression.
	Index is of type DatetimeIndex,
	column 'y' contains the targets, i.e. the value that
	the time series ssume in each point
	the remaining column contains different typed of features.
	"""

	categorical_day_central = np.zeros((data_lm.shape[0],7))
	for i in range(categorical_day_central.shape[0]):
	time_of_day=(i%data_per_week)%data_per_day
	if(time_of_day>28 and time_of_day<80):
	categorical_day_central[i,(i%data_per_week)//data_per_day] = 1

	for i in range(7):
	data_lm["central_part_day_{}".format(i+1)] = categorical_day_central[:,i]

	return data_lm

	def add_categorical_workday(data_lm, data_per_week = 672, data_per_day = 96):
	"""
	Categorical features describing if the measuremente is in the central
	part of a working day or in a Sunday

	:param int data_per_week: number of measurement for every week (default value is 672, a measurement every 15 minutes)
	:param int data_per_day: number of measurement for every day (default value is 96, a measurement every 15 minutes)
	:param,output DataFrame data_lm: DataFrame containing target and features that will be
	used for linear regression.
	Index is of type DatetimeIndex,
	column 'y' contains the targets, i.e. the value that
	the time series ssume in each point
	the remaining column contains different typed of features.
	"""
	categorical_working_day_central = np.zeros(data_lm.shape[0])
	for i in range(categorical_working_day_central.shape[0]):
	time_of_day=(i%data_per_week)%data_per_day
	day_of_week=(i//data_per_day)%7

	if(time_of_day>28 and time_of_day<80):
	if(day_of_week != 6):
	categorical_working_day_central[i] = 1
	data_lm["central_part_working_day"] = categorical_working_day_central

	return data_lm

	def add_categorical_month(data_lm, data_per_week = 672, data_per_day = 96):
	"""
	Categorical features representing the month of the year are added

	:param int data_per_week: number of measurement for every week (default value is 672, a measurement every 15 minutes)
	:param int data_per_day: number of measurement for every day (default value is 96, a measurement every 15 minutes)
	:param,output DataFrame data_lm: DataFrame containing target and features that will be
	used for linear regression.
	Index is of type DatetimeIndex,
	column 'y' contains the targets, i.e. the value that
	the time series ssume in each point
	the remaining column contains different typed of features.
	"""
	categorical_months = np.zeros((data_lm.shape[0],12))
	for i in range(data_lm.shape[0]):
	month=((i//2688)+1)%12
	time_of_day=(i%data_per_week)%data_per_day
	if(time_of_day>28 and time_of_day<80):
	if(month == 0):
	categorical_months[i,0] = 1
	else:
	categorical_months[i,month] = 1

	# Add only categorical month from May to August
	for i in range(1, 12):
	data_lm["month_{}".format(i+1)] = categorical_months[:,i]
	return data_lm

	def add_categorical_months_summer(data_lm, data_per_week = 672, data_per_day = 96):
	"""
	Categorical features representing the month of the year are added only
	for the months from May to August

	:param int data_per_week: number of measurement for every week (default value is 672, a measurement every 15 minutes)
	:param int data_per_day: number of measurement for every day (default value is 96, a measurement every 15 minutes)
	:param,output DataFrame data_lm: DataFrame containing target and features that will be
	used for linear regression.
	Index is of type DatetimeIndex,
	column 'y' contains the targets, i.e. the value that
	the time series ssume in each point
	the remaining column contains different typed of features.
	"""
	categorical_months = np.zeros((data_lm.shape[0],12))
	for i in range(data_lm.shape[0]):
	month=((i//2688)+1)%12
	time_of_day=(i%data_per_week)%data_per_day
	if(time_of_day>28 and time_of_day<80):
	if(month == 0):
	categorical_months[i,0] = 1
	else:
	categorical_months[i,month] = 1

	# Add only categorical month from May to August
	for i in range(5, 9):
	data_lm["month_{}".format(i+1)] = categorical_months[:,i]
	return data_lm


	def timeseries_train_test_split(X, y, test_size):
	"""
	Perform train-test split with respect to time series structure
	param: matrix X: data regression matrix
	param: vector y: targets
	param: int test_size: percentage of the data used for testing
	"""

	# get the index after which test set starts
	test_index = int(len(X)*(1-test_size))

	X_train = X.iloc[:test_index]
	y_train = y.iloc[:test_index]
	X_test = X.iloc[test_index:]
	y_test = y.iloc[test_index:]

	return X_train, X_test, y_train, y_test

helpers_linear_regression.pyNo OneTemporaryActions

File Metadata

helpers_linear_regression.pyView Options

Event Timeline

helpers_linear_regression.py
No OneTemporary
Actions

helpers_linear_regression.py
View Options