Page Menu · Home · c4science

cluster_pv.py
No One · Temporary

File Metadata

Created
Fri, Jul 4, 14:42

cluster_pv.py

import numpy as np
import pandas as pd
import os
import sys
import time
from sklearn.cluster import DBSCAN
#### Variable definitions
## INPUTS (positional command-line arguments)
#   argv[1]: input CSV of detected PV pixels (must contain 'x', 'y', 'pixel_area' columns)
#   argv[2]: output CSV path for the clustered, size-filtered pixels
INFILE = sys.argv[1]
OUTFILE = sys.argv[2]

# Optional input: minimum total area of an installation (in m2).
if len(sys.argv) > 3:
    MIN_SIZE = float(sys.argv[3])
else:
    MIN_SIZE = 2.5

# DBSCAN hyperparameters: see https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
# Both must be given together (argv[4] and argv[5]) to override the defaults.
if len(sys.argv) > 5:
    EPS = float(sys.argv[4])
    MIN_SAMPLE = int(sys.argv[5])
else:
    EPS = 0.72       # Accept 2 missing pixels in x, y, or diagonal direction
    MIN_SAMPLE = 10  # Minimum 10 samples for "core points"

# Constants: names of the coordinate columns used for clustering
COORDS = ['x', 'y']

print(f'\nLoad pixel data from {INFILE}')
print(f'Minimum installation size: {MIN_SIZE:.1f} m2')
print(f'DBSCAN parameters: EPS = {EPS:.2f}, MIN_SAMPLE = {MIN_SAMPLE}')
print(f'Save output to {OUTFILE}\n')

#### Load data
all_pixels = pd.read_csv(INFILE)

#### Perform clustering
tt = time.time()
# DBSCAN labels each pixel with a cluster ID; noise points get label -1.
all_pixels['cluster_ID'] = DBSCAN(eps=EPS, min_samples=MIN_SAMPLE, n_jobs=-1).fit_predict(all_pixels[COORDS])
print(f'Clustered data in {time.time() - tt:.2f}s')

# Exclude all pixels without an assigned cluster (DBSCAN noise label is -1)
PV_clusters = all_pixels[all_pixels.cluster_ID >= 0]

# Keep only clusters whose summed pixel area reaches the minimum size.
# Sum only the 'pixel_area' column — summing the whole frame is wasteful and
# fails on non-numeric columns under modern pandas.
PV_cluster_area = PV_clusters.groupby('cluster_ID')['pixel_area'].sum()
valid_clusters = PV_cluster_area[PV_cluster_area >= MIN_SIZE]
clustered_pixels = all_pixels[all_pixels.cluster_ID.isin(valid_clusters.index.values)]
print(clustered_pixels.head())

# Save output
clustered_pixels.to_csv(OUTFILE, index=False)
print(f'Saved {OUTFILE} - DONE')

Event Timeline