DBSCAN_hyperparams.py

import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import os
import time
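
# Sweep DBSCAN hyperparameters over a grid of (eps, min_samples) pairs,
# clustering buildings by their x/y coordinates, and record summary
# statistics of the resulting clusters for each combination.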
coords = ['x', 'y']
scratch_fp = '/scratch/walch/cluster_hubs'

# Load building data; expects columns 'x', 'y', 'annual_demand_kWh', 'peak_demand_W'.
# Keep buildings with positive annual demand and a peak demand below 1 MW.
data = pd.read_csv(os.path.join(scratch_fp, 'BLD_data.csv'))
demand = data[data.annual_demand_kWh > 0]
cluster_data = demand[demand.peak_demand_W < 1e6].copy()  # copy so 'clusterID' can be added below
print(cluster_data.head())
print(len(cluster_data))
EPS_vec = np.arange(50, 750, 50)
min_samples_vec = np.arange(5, 105, 5)
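# 14 eps values x 20 min_samples values = 280 combinations; eps is a distance
# in the units of the x/y coordinates (assumed here to be metres).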
stats = []
for EPS in EPS_vec:
    for MIN_SAMPLES in min_samples_vec:
        print('EPS = %d; min_samples = %d' % (EPS, MIN_SAMPLES))
        model = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, n_jobs=-1)
        tt = time.time()
        cluster_data['clusterID'] = model.fit_predict(cluster_data[coords])
        cluster_time = time.time() - tt
        print('Fit and predict in %.2fs' % cluster_time)

        # Prepare pd.Series with stats; clusterID == -1 marks DBSCAN noise points
        cluster_groups = cluster_data[cluster_data.clusterID > -1].groupby('clusterID')['peak_demand_W']
        peak_demand_kW = cluster_groups.sum() / 1e3
        large_clusters = peak_demand_kW[peak_demand_kW > 1000]  # clusters above 1 MWp
        count_stats = cluster_groups.count().describe().iloc[1:]  # drop the 'count' row
        kWp_stats = peak_demand_kW.describe().iloc[1:]
        count_stats.index = ['N_BLD_%s' % name for name in count_stats.index]
        kWp_stats.index = ['kWp_%s' % name for name in kWp_stats.index]
        other_info = pd.Series({
            'EPS': EPS,
            'min_samples': MIN_SAMPLES,
            'exec_time': cluster_time,
            'N_clusters': len(peak_demand_kW),  # noise (-1) is already excluded above
            'N_clust_>1MWp': len(large_clusters),
            'perc_outliers': 100 * len(cluster_data[cluster_data.clusterID == -1]) / len(cluster_data),
            'aprx_add_clst': large_clusters.sum() / 1e3,  # total MWp in clusters above 1 MWp
        })
        stats.append(pd.concat([other_info, kWp_stats, count_stats]))

# One row of statistics per (eps, min_samples) combination
stats_all = pd.DataFrame(stats)
print(stats_all.head())
stats_all.to_csv('/scratch/walch/scratch/DBSCAN_hyperparams_GWRonly.csv', index=False)
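
# A possible follow-up (a sketch, not part of the original script): load the
# CSV written above and shortlist combinations with few outliers, e.g.
#   res = pd.read_csv('/scratch/walch/scratch/DBSCAN_hyperparams_GWRonly.csv')
#   print(res.sort_values('perc_outliers').head(10))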
