"/Applications/anaconda2/envs/py3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n"
"/Applications/anaconda2/envs/py3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n"
+ "#### TRAINING OF ALL ESTIMATORS - using cross-validation"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Validation MSE: 0.020185\n"
+ "\n",
+ "Training KNN\n",
+ "\n",
+ "CPU time: 1.45 seconds\n",
+ "Wall clock time: 0.70 seconds\n",
+ "Cross-validation MSE: 0.024183\n",
+ "Cross-validation RMSE: 0.155509\n",
+ "Cross-validation MAE: 0.118705\n",
+ "Cross-validation MBE: -0.000424\n",
+ "Cross-validation R2: 0.493477\n",
+ "\n",
+ "Training LIN\n",
+ "\n",
+ "CPU time: 0.10 seconds\n",
+ "Wall clock time: 0.06 seconds\n",
+ "Cross-validation MSE: 0.018053\n",
+ "Cross-validation RMSE: 0.134360\n",
+ "Cross-validation MAE: 0.105472\n",
+ "Cross-validation MBE: -0.000001\n",
+ "Cross-validation R2: 0.621882\n",
+ "\n",
+ "Training RF\n",
+ "\n",
+ "CPU time: 1068.10 seconds\n",
+ "Wall clock time: 287.64 seconds\n",
+ "Cross-validation MSE: 0.014735\n",
+ "Cross-validation RMSE: 0.121387\n",
+ "Cross-validation MAE: 0.091532\n",
+ "Cross-validation MBE: 0.002513\n",
+ "Cross-validation R2: 0.691376\n",
+ "\n",
+ "Training XGB\n",
+ "\n",
+ "CPU time: 158.91 seconds\n",
+ "Wall clock time: 159.39 seconds\n",
+ "Cross-validation MSE: 0.015186\n",
+ "Cross-validation RMSE: 0.123233\n",
+ "Cross-validation MAE: 0.092057\n",
+ "Cross-validation MBE: 0.000883\n",
+ "Cross-validation R2: 0.681918\n",
+ "\n",
+ "Training ELM\n",
+ "\n",
+ "CPU time: 55.45 seconds\n",
+ "Wall clock time: 26.13 seconds\n",
+ "Cross-validation MSE: 0.017579\n",
+ "Cross-validation RMSE: 0.132585\n",
+ "Cross-validation MAE: 0.103028\n",
+ "Cross-validation MBE: 0.000073\n",
+ "Cross-validation R2: 0.631807\n",
+ "\n",
+ "Training MLP\n",
+ "\n",
+ "CPU time: 338.23 seconds\n",
+ "Wall clock time: 129.73 seconds\n",
+ "Cross-validation MSE: 0.022509\n",
+ "Cross-validation RMSE: 0.150029\n",
+ "Cross-validation MAE: 0.117700\n",
+ "Cross-validation MBE: -0.003906\n",
+ "Cross-validation R2: 0.528548\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Applications/anaconda2/envs/py3/lib/python3.6/site-packages/ipykernel_launcher.py:16: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\u001b[0;32m/Applications/anaconda2/envs/py3/lib/python3.6/site-packages/sklearn/ensemble/forest.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 326\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrees\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 327\u001b[0m verbose=self.verbose, class_weight=self.class_weight)\n\u001b[0;32m--> 328\u001b[0;31m for i, t in enumerate(trees))\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m \u001b[0;31m# Collect newly grown trees\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/Applications/anaconda2/envs/py3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 787\u001b[0m \u001b[0;31m# consumption.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 788\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 789\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 790\u001b[0m \u001b[0;31m# Make sure that we get a last message telling us we are done\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 791\u001b[0m \u001b[0melapsed_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_start_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"/Applications/anaconda2/envs/py3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n"
"/Applications/anaconda2/envs/py3/lib/python3.6/site-packages/numpy/lib/arraysetops.py:472: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
" mask |= (ar1 == a)\n"
]
}
],
"source": [
"## === MERGE INFORMATION OF ROOFS & SHADING ==\n",
"rooftops_filled.loc[replacement_idx, :] = rooftops_with_missing_data.iloc[:,1:-1].values # NEED .values here as otherwise matching is performed by index - so NaNs are introduced"
"# Merge roofs with shading info and roofs with panelled area info"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 1: get roofs for training set\n",
"By merging the roofs_w_shade and roofs_w_panels, we get a dataframe that contains all rooftops for which 50cm shading information is available, and who's footprint overlaps with the SITG roof information, for which superstructure information is available. To complement the superstructure information, all roof surfaces smaller than 8m^2 in SITG were also categorized as superstructures."
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"training_roofs = pd.merge(GVA_roofs_w_shade, GVA_roof_corners, how = 'inner', on = 'DF_UID')"
"Add information of panelled area for Sonnendach roof polygons (ftr) and roof polygons with excluded superstructures (tgt). Note that CH_panels_ftr contains ALL rooftops and their panel_count information. For all roofs where panel_count == 0, we know already that there cannot be any panels fitted (due to geometry/size), neither vertically nor horizontally, so any calculation of available area for these rooftops is obsolete, i.e. the panels are excluded from further analysis."
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# exclude all roofs that cannot fit even a single panel on the roof polygons (neither vertical nor horizontal)\n",
"### Step 3: add shading information (shaded area only)\n",
"The shading values (shaded_area_ratio) based on the 2m DOM are added as features, while the shading based on the 50cm resolution DOM are used to calculate the labels (they are assumed to represent the ground truth for shading). For all roofs for which the 2m DOM shading information is not available, the mean values as given in the CH_shade_cats are used."
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"# add shading features\n",
"training_roofs_shade_ftr = training_roofs_panel_tgt.merge(GVA_shade_ftr.loc[:,['DF_UID', 'fully_shaded_ratio']], how = 'left', on = 'DF_UID')\n",
+ "# Merge roofs with shading info and roofs with panelled area info"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Step 1: get roofs for training set\n",
+ "By merging the roofs_w_panels and the area of the gva_shp, we get a dataframe that contains all rooftops who's footprint overlaps with the SITG roof information for which superstructure information is available (all roofs where area_with_SP == area_without_SP are excluded!!!). To complement the superstructure information, all roof surfaces smaller than 8m^2 in SITG were also categorized as superstructures."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "training_roofs = pd.merge(GVA_roofs_all, GVA_roof_corners, how = 'inner', on = 'DF_UID')"
+ "Add information of panelled area for Sonnendach roof polygons (ftr) and roof polygons with excluded superstructures (tgt). Note that CH_panels_ftr contains ALL rooftops and their panel_count information. For all roofs where panel_count == 0, we know already that there cannot be any panels fitted (due to geometry/size), neither vertically nor horizontally, so any calculation of available area for these rooftops is obsolete, i.e. the panels are excluded from further analysis."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# merge panelled area ratio for features (no superstructure excluded)\n",
+ "training_roofs_panel_ftr = training_roofs.merge(CH_panels_ftr.loc[:,['DF_UID', 'panel_tilt', 'best_align', 'panelled_area_ratio']], how = 'left', on = 'DF_UID')\n",
+ "### Step 3: Exclude roofs without superstructure data\n",
+ "The shapefiles of GVA 6 GVA_noSP can be use to check if the area was changed when superstructures were subtracted. In the cases where this was not the case, the respective roofs are removed from training - it is assumed that the dataset is incomplete and superstructures were not correctly registered rather than there are no superstructures that can be found on the roof. This may be a conservative estimate in some cases, which however is preferred to an overestimate. Uncertainty is to be quantified to assess the level of confidence"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extract area information from shapefiles\n",
+ "gva_all = gva_shp.loc[:,['DF_UID']]\n",
+ "gva_all['area_full'] = gva_shp.area\n",
+ "\n",
+ "gva_noSP = gva_shp_noSP.loc[:,['DF_UID']]\n",
+ "gva_noSP['area_noSP'] = gva_shp_noSP.area"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Merge information of areas wih and without superstructures; areas not found in gva_noSP are given area = 0\n",
+ "gva_data = gva_all.merge(gva_noSP, on = 'DF_UID', how = 'left')\n",