diff --git a/output/bay_area/data_distribution.ipynb b/output/bay_area/data_distribution.ipynb index 5eb9b31..151a912 100644 --- a/output/bay_area/data_distribution.ipynb +++ b/output/bay_area/data_distribution.ipynb @@ -1,1316 +1,1449 @@ { "cells": [ { "cell_type": "code", - "execution_count": 104, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from scipy.stats import gmean" ] }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Numbers of compound in soil: 889\n", "Numbers of compound in sludge: 160\n", "Find overlap cpds: 27\n", "Numbers of compound in combined: 1022\n" ] } ], "source": [ "# input_file_path = \"C:\\\\Users\\\\leetseng\\\\TWtest\\\\input\\\\\"\n", "output_path = \"C:\\\\Users\\\\leetseng\\\\TWtest\\\\output\\\\bay_area\\\\\"\n", "\n", "#This files already contain the bay mean and bay std.\n", "#The reduced SMILES of previous files were generated from other toolkit. 
This file lead to inconsistant of SMILES.\n", "file_name_soil = \"soil_model_input_full.txt\" #soil_model_input_bayes_curated_full.txt\n", "file_name_sludge = \"sludge_bay_PriorMuStd_3.tsv\" #sludge_bayesian_PriorMuStd_09.tsv\n", "df_soil_original = pd.read_csv(output_path + file_name_soil, sep='\\t')\n", "df_sludge_original = pd.read_csv(output_path + file_name_sludge, sep='\\t')\n", "\n", "#Use the copy of dataframe for the later test and manipulation\n", "df_soil_ = df_soil_original.copy()\n", "df_sludge_ = df_sludge_original.copy()\n", "df_sludge_.loc[:, 'package'] = 0 # O: sludge, 1: soil\n", "df_soil_.loc[:, 'package'] = 1\n", "\n", "list_of_reduced_smiles_soil = df_soil_['reduced_smiles'].values.tolist() #'canonicalize_smiles'\n", "set_of_reduced_smiles_soil = set(list_of_reduced_smiles_soil)\n", "print(\"Numbers of compound in soil: \", len(set_of_reduced_smiles_soil))\n", "\n", "list_of_reduced_smiles_sludge = df_sludge_['reduced_smiles'].values.tolist() #'canonicalize_smiles'\n", "set_of_reduced_smiles_sludge = set(list_of_reduced_smiles_sludge)\n", "print(\"Numbers of compound in sludge: \", len(set_of_reduced_smiles_sludge))\n", "\n", "print(\"Find overlap cpds:\", len(set_of_reduced_smiles_sludge & set_of_reduced_smiles_soil))\n", "\n", "df_combined_ = pd.concat([df_sludge_, df_soil_], join=\"inner\")\n", "list_of_reduced_smiles = df_combined_['reduced_smiles'].values.tolist() #'canonicalize_smiles'\n", "set_of_reduced_smiles_combined = set(list_of_reduced_smiles)\n", "print(\"Numbers of compound in combined: \", len(set_of_reduced_smiles_combined))" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hl_log_stdhl_log_medianhl_log_gmeanhl_log_bayesian_mean
count107.000000160.000000160.000000160.000000
mean0.3183330.4444100.4207060.456062
std0.9232511.0775891.0753461.278536
min-1.951180-1.636296-1.636296-2.290000
25%-0.094386-0.325918-0.366208-0.395000
50%0.4906930.5671970.5340770.540000
75%0.8536491.1655671.1251501.212500
max2.6461963.0169173.0169173.610000
\n", "
" ], "text/plain": [ " hl_log_std hl_log_median hl_log_gmean hl_log_bayesian_mean\n", "count 107.000000 160.000000 160.000000 160.000000\n", "mean 0.318333 0.444410 0.420706 0.456062\n", "std 0.923251 1.077589 1.075346 1.278536\n", "min -1.951180 -1.636296 -1.636296 -2.290000\n", "25% -0.094386 -0.325918 -0.366208 -0.395000\n", "50% 0.490693 0.567197 0.534077 0.540000\n", "75% 0.853649 1.165567 1.125150 1.212500\n", "max 2.646196 3.016917 3.016917 3.610000" ] }, - "execution_count": 49, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def non_nan_mean(x):\n", " if x.empty: \n", " return None\n", " else:\n", " x = x.dropna()\n", " return np.mean(x)\n", "\n", "\n", "def get_subset_target_variable(Y): # des_X: df of descriptor, Y: df_soil or df_sludge\n", " Y_ = Y.copy()\n", " Y_.loc[:, 'package'] = 1\n", " aggs = [non_nan_mean]\n", " df_subset_Y = Y_[[\"reduced_smiles\", \"hl_log_std\", \"hl_log_median\", \"hl_log_gmean\", \"hl_log_bayesian_mean\"]] # \"temperature\", \"log_hl_combined\", \"log_hl_biomass_corrected\"\n", " Y_group = df_subset_Y.groupby([\"reduced_smiles\"]).agg(aggs).reset_index()\n", " Y_group.columns = Y_group.columns.droplevel(1)\n", " return Y_group\n", "\n", "sludge_dist = get_subset_target_variable(df_sludge_)\n", "sludge_dist.describe()" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hl_log_stdhl_log_medianhl_log_gmeanhl_log_bayesian_mean
count889.000000889.000000889.000000889.000000
mean0.2794841.2252681.2310871.225040
std0.1926070.9177590.9104090.961831
min0.000000-2.721103-2.674359-2.250000
25%0.1517450.6812410.7153760.720000
50%0.2564141.3404441.3533511.350000
75%0.3799631.8949921.8832891.880000
max1.9792713.9999573.9999574.380000
\n", "
" ], "text/plain": [ " hl_log_std hl_log_median hl_log_gmean hl_log_bayesian_mean\n", "count 889.000000 889.000000 889.000000 889.000000\n", "mean 0.279484 1.225268 1.231087 1.225040\n", "std 0.192607 0.917759 0.910409 0.961831\n", "min 0.000000 -2.721103 -2.674359 -2.250000\n", "25% 0.151745 0.681241 0.715376 0.720000\n", "50% 0.256414 1.340444 1.353351 1.350000\n", "75% 0.379963 1.894992 1.883289 1.880000\n", "max 1.979271 3.999957 3.999957 4.380000" ] }, - "execution_count": 42, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soil_dist = get_subset_target_variable(df_soil_)\n", "soil_dist.describe()" ] }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\2105885712.py:8: UserWarning: \n", + "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_4072\\2105885712.py:8: UserWarning: \n", "\n", "`distplot` is a deprecated function and will be removed in seaborn v0.14.0.\n", "\n", "Please adapt your code to use either `displot` (a figure-level function with\n", "similar flexibility) or `histplot` (an axes-level function for histograms).\n", "\n", "For a guide to updating your code to use the new functions, please see\n", "https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751\n", "\n", " ax1 = sns.distplot(soil_dist['hl_log_std'], color=\"#a6a871\", ax=ax[1,0])\n", - "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\2105885712.py:16: UserWarning: \n", + "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_4072\\2105885712.py:16: UserWarning: \n", "\n", "`distplot` is a deprecated function and will be removed in seaborn v0.14.0.\n", "\n", "Please adapt your code to use either `displot` (a figure-level function with\n", "similar flexibility) or `histplot` (an axes-level function for histograms).\n", "\n", "For a guide to updating your code 
to use the new functions, please see\n", "https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751\n", "\n", " ax2 = sns.distplot(soil_dist['hl_log_median'], color=\"#a6a871\", ax=ax[1,1])\n", - "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\2105885712.py:23: UserWarning: \n", + "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_4072\\2105885712.py:23: UserWarning: \n", "\n", "`distplot` is a deprecated function and will be removed in seaborn v0.14.0.\n", "\n", "Please adapt your code to use either `displot` (a figure-level function with\n", "similar flexibility) or `histplot` (an axes-level function for histograms).\n", "\n", "For a guide to updating your code to use the new functions, please see\n", "https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751\n", "\n", " ax3 = sns.distplot(soil_dist['hl_log_gmean'], color=\"#a6a871\", ax=ax[1,2])\n", - "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\2105885712.py:30: UserWarning: \n", + "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_4072\\2105885712.py:30: UserWarning: \n", "\n", "`distplot` is a deprecated function and will be removed in seaborn v0.14.0.\n", "\n", "Please adapt your code to use either `displot` (a figure-level function with\n", "similar flexibility) or `histplot` (an axes-level function for histograms).\n", "\n", "For a guide to updating your code to use the new functions, please see\n", "https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751\n", "\n", " ax4 = sns.distplot(soil_dist['hl_log_bayesian_mean'], color=\"#a6a871\", ax=ax[1,3]) # biomass_hl_log_gmean\n", - "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\2105885712.py:38: UserWarning: \n", + "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_4072\\2105885712.py:38: UserWarning: \n", "\n", "`distplot` is a deprecated function and will be removed in seaborn v0.14.0.\n", "\n", "Please adapt your code to use either `displot` (a figure-level function with\n", 
"similar flexibility) or `histplot` (an axes-level function for histograms).\n", "\n", "For a guide to updating your code to use the new functions, please see\n", "https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751\n", "\n", " ax5 = sns.distplot(sludge_dist['hl_log_std'], color=\"#5f8c63\", ax=ax[0,0]) # hl_log_gmean , fit=norm\n", - "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\2105885712.py:46: UserWarning: \n", + "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_4072\\2105885712.py:46: UserWarning: \n", "\n", "`distplot` is a deprecated function and will be removed in seaborn v0.14.0.\n", "\n", "Please adapt your code to use either `displot` (a figure-level function with\n", "similar flexibility) or `histplot` (an axes-level function for histograms).\n", "\n", "For a guide to updating your code to use the new functions, please see\n", "https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751\n", "\n", " ax6 = sns.distplot(sludge_dist['hl_log_median'], color=\"#5f8c63\",ax=ax[0,1])\n", - "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\2105885712.py:52: UserWarning: \n", + "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_4072\\2105885712.py:52: UserWarning: \n", "\n", "`distplot` is a deprecated function and will be removed in seaborn v0.14.0.\n", "\n", "Please adapt your code to use either `displot` (a figure-level function with\n", "similar flexibility) or `histplot` (an axes-level function for histograms).\n", "\n", "For a guide to updating your code to use the new functions, please see\n", "https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751\n", "\n", " ax7 = sns.distplot(sludge_dist['hl_log_gmean'], color=\"#5f8c63\",ax=ax[0,2])\n", - "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\2105885712.py:58: UserWarning: \n", + "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_4072\\2105885712.py:58: UserWarning: \n", "\n", "`distplot` is a deprecated function and will be removed 
in seaborn v0.14.0.\n", "\n", "Please adapt your code to use either `displot` (a figure-level function with\n", "similar flexibility) or `histplot` (an axes-level function for histograms).\n", "\n", "For a guide to updating your code to use the new functions, please see\n", "https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751\n", "\n", " ax8 = sns.distplot(sludge_dist['hl_log_bayesian_mean'], color=\"#5f8c63\",ax=ax[0,3]) # biomass_hl_log_gmean\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#Sludge target variable data distribution\n", "#Candidates: halflife_log, hl_biomass_corrected, log_hl_biomass_corrected, hl_log_gmean, hl_log_median, biomass_hl_log_gmean, biomass_hl_log_median, hl_log_bayesian_mean \n", "fig, ax = plt.subplots(2,4, sharex=True)\n", "width, height = fig.get_size_inches()\n", "fig.set_size_inches(width*3, height*1.5)\n", "\n", "\n", "ax1 = sns.distplot(soil_dist['hl_log_std'], color=\"#a6a871\", ax=ax[1,0])\n", "ax1.set(xlabel='std log(DT$_{50}$)') \n", "ax1.axvline(soil_dist['hl_log_std'].mean(), label='mean', linestyle='-.', color='r')\n", "ax1.axvline(soil_dist['hl_log_std'].median(), label='median', linestyle='-.', color='b')\n", "ax1.grid(False)\n", "ax1.legend()\n", "plt.text(x=-22, y=1.27, s='Eawag-Sludge Package', fontsize=16, weight='bold')\n", "\n", "ax2 = sns.distplot(soil_dist['hl_log_median'], color=\"#a6a871\", ax=ax[1,1])\n", "ax2.set(xlabel='median log(DT$_{50}$)') \n", "ax2.axvline(soil_dist['hl_log_median'].mean(), label='mean', linestyle='-.', color='r')\n", "ax2.axvline(soil_dist['hl_log_median'].median(), label='median', linestyle='-.', color='b')\n", "ax2.grid(False)\n", "# ax2.legend()\n", "\n", "ax3 = sns.distplot(soil_dist['hl_log_gmean'], color=\"#a6a871\", ax=ax[1,2])\n", "ax3.set(xlabel='gmean log(DT$_{50}$)') \n", "ax3.axvline(soil_dist['hl_log_gmean'].mean(), label='mean', linestyle='-.', color='r')\n", "ax3.axvline(soil_dist['hl_log_gmean'].median(), label='median', linestyle='-.', color='b')\n", "ax3.grid(False)\n", "# ax3.legend()\n", "\n", "ax4 = sns.distplot(soil_dist['hl_log_bayesian_mean'], color=\"#a6a871\", ax=ax[1,3]) # biomass_hl_log_gmean\n", "ax4.set(xlabel='bayesian mean log(DT$_{50}$)') \n", "ax4.axvline(soil_dist['hl_log_bayesian_mean'].mean(), label='mean', linestyle='-.', color='r')\n", "ax4.axvline(soil_dist['hl_log_bayesian_mean'].median(), label='median', linestyle='-.', color='b')\n", "ax4.grid(False)\n", "# 
ax4.legend()\n", "\n", "\n", "ax5 = sns.distplot(sludge_dist['hl_log_std'], color=\"#5f8c63\", ax=ax[0,0]) # hl_log_gmean , fit=norm\n", "ax5.set(xlabel='std log(DT$_{50}$)') \n", "ax5.axvline(sludge_dist['hl_log_std'].mean(), label='mean', linestyle='-.', color='r')\n", "ax5.axvline(sludge_dist['hl_log_std'].median(), label='median', linestyle='-.', color='b')\n", "ax5.grid(False)\n", "ax5.legend()\n", "plt.text(x=-21.4, y=0.6, s='Eawag-Soil Package', fontsize=16, weight='bold')\n", "\n", "ax6 = sns.distplot(sludge_dist['hl_log_median'], color=\"#5f8c63\",ax=ax[0,1])\n", "ax6.axvline(sludge_dist['hl_log_median'].mean(), label='mean', linestyle='-.', color='r')\n", "ax6.axvline(sludge_dist['hl_log_median'].median(), label='median', linestyle='-.', color='b')\n", "ax6.grid(False)\n", "# ax6.legend()\n", "\n", "ax7 = sns.distplot(sludge_dist['hl_log_gmean'], color=\"#5f8c63\",ax=ax[0,2])\n", "ax7.axvline(sludge_dist['hl_log_gmean'].mean(), label='mean', linestyle='-.', color='r')\n", "ax7.axvline(sludge_dist['hl_log_gmean'].median(), label='median', linestyle='-.', color='b')\n", "ax7.grid(False)\n", "# ax7.legend()\n", "\n", "ax8 = sns.distplot(sludge_dist['hl_log_bayesian_mean'], color=\"#5f8c63\",ax=ax[0,3]) # biomass_hl_log_gmean\n", "ax8.axvline(sludge_dist['hl_log_bayesian_mean'].mean(), label='mean', linestyle='-.', color='r')\n", "ax8.axvline(sludge_dist['hl_log_bayesian_mean'].median(), label='median', linestyle='-.', color='b')\n", "ax8.grid(False)\n", "# ax8.legend()\n" ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " 
df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * 
df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], 
df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] 
-1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", 
"C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", "C:\\Users\\leetseng\\AppData\\Local\\Temp\\ipykernel_20656\\274356051.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
reduced_smileshl_log_std_xhl_log_median_xhl_log_gmean_xhl_log_bayesian_mean_xhl_log_std_yhl_log_median_yhl_log_gmean_yhl_log_bayesian_mean_ylogKochl_log_gmean_y_calculatedhl_log_bayesian_mean_y_calculatedrmse
0C#CCOc1ccc(CCNC(=O)C(OCC#C)c2ccc(Cl)cc2)cc1OC0.387665-0.255707-0.344881-0.390.3002811.7737861.7211931.724.452.7632092.72173.111700
1CC(C)c1ccc(NC(=O)N(C)C)cc11.0868800.8824420.7995210.960.1552471.0433221.0623481.062.001.1455601.29322.212883
2CC(NC(=O)c1cc(Cl)cc(Br)c1NC(=O)c1cc(Br)nn1-c1n...NaN1.2953471.2953472.640.2280782.8048212.9244742.964.984.8499196.08702.687968
3CC1(C(=O)Nc2ccc(O)c(Cl)c2Cl)CCCCC10.7404470.7537750.5371490.540.295061-0.503062-0.536353-0.543.672.7244772.72712.571911
4CCN(CC)C(=O)C(C)Oc1cccc2ccccc12NaN1.8408251.8408251.780.2119602.5909222.5352822.533.243.4551593.39922.411668
5CCNC(=O)C(C)OC(=O)Nc1ccccc1NaN1.8408251.8408251.770.4140130.7341740.9546740.972.362.4959592.43082.218009
6CCNc1nc(Cl)nc(NC(C)(C)C)n10.4725081.2953471.2490311.570.2062562.0453052.0398962.042.321.9079092.20322.067377
\n", "
" ], "text/plain": [ " reduced_smiles hl_log_std_x \\\n", "0 C#CCOc1ccc(CCNC(=O)C(OCC#C)c2ccc(Cl)cc2)cc1OC 0.387665 \n", "1 CC(C)c1ccc(NC(=O)N(C)C)cc1 1.086880 \n", "2 CC(NC(=O)c1cc(Cl)cc(Br)c1NC(=O)c1cc(Br)nn1-c1n... NaN \n", "3 CC1(C(=O)Nc2ccc(O)c(Cl)c2Cl)CCCCC1 0.740447 \n", "4 CCN(CC)C(=O)C(C)Oc1cccc2ccccc12 NaN \n", "5 CCNC(=O)C(C)OC(=O)Nc1ccccc1 NaN \n", "6 CCNc1nc(Cl)nc(NC(C)(C)C)n1 0.472508 \n", "\n", " hl_log_median_x hl_log_gmean_x hl_log_bayesian_mean_x hl_log_std_y \\\n", "0 -0.255707 -0.344881 -0.39 0.300281 \n", "1 0.882442 0.799521 0.96 0.155247 \n", "2 1.295347 1.295347 2.64 0.228078 \n", "3 0.753775 0.537149 0.54 0.295061 \n", "4 1.840825 1.840825 1.78 0.211960 \n", "5 1.840825 1.840825 1.77 0.414013 \n", "6 1.295347 1.249031 1.57 0.206256 \n", "\n", " hl_log_median_y hl_log_gmean_y hl_log_bayesian_mean_y logKoc \\\n", "0 1.773786 1.721193 1.72 4.45 \n", "1 1.043322 1.062348 1.06 2.00 \n", "2 2.804821 2.924474 2.96 4.98 \n", "3 -0.503062 -0.536353 -0.54 3.67 \n", "4 2.590922 2.535282 2.53 3.24 \n", "5 0.734174 0.954674 0.97 2.36 \n", "6 2.045305 2.039896 2.04 2.32 \n", "\n", " hl_log_gmean_y_calculated hl_log_bayesian_mean_y_calculated rmse \n", "0 2.763209 2.7217 3.111700 \n", "1 1.145560 1.2932 2.212883 \n", "2 4.849919 6.0870 2.687968 \n", "3 2.724477 2.7271 2.571911 \n", "4 3.455159 3.3992 2.411668 \n", "5 2.495959 2.4308 2.218009 \n", "6 1.907909 2.2032 2.067377 " ] }, "execution_count": 165, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import math\n", "from sklearn.metrics import mean_squared_error\n", "\n", "df_read_across = pd.merge(sludge_dist, soil_dist, on='reduced_smiles', how='inner')\n", "df_read_across['logKoc'] = [4.45, 2, 4.98, 3.67, 3.24, 2.36, 2.32, 2.08, 3, 2.53, 2.75, 2.4, 2.46, 1.64, 3.74, 3.1, 1.5, 2.11, 1.76, 5.22, 3.93, 2.67, 2.61, 2.43, 2.58, 1.66, 2.51]\n", "df_read_across['hl_log_gmean_y_calculated'] = np.nan\n", "df_read_across['hl_log_bayesian_mean_y_calculated'] = np.nan\n", 
"df_read_across['rmse'] = np.nan\n", "# MSE = mean_squared_error(df_read_across['hl_log_gmean_x'], df_read_across['hl_log_gmean_y_calculated']) \n", "# rmse = math.sqrt(MSE) \n", "\n", "for i in range(df_read_across.shape[0]):\n", " df_read_across['hl_log_gmean_y_calculated'][i] = 0.92 * df_read_across['hl_log_gmean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", " df_read_across['hl_log_bayesian_mean_y_calculated'][i] = 0.92 * df_read_across['hl_log_bayesian_mean_x'][i] + 1.09 * df_read_across['logKoc'][i] -1.77\n", " df_read_across['rmse'][i] = mean_squared_error(df_read_across['hl_log_bayesian_mean_x'][:i+1], df_read_across['hl_log_bayesian_mean_y_calculated'][:i+1], squared=False)\n", "\n", "\n", "df_read_across.loc[df_read_across['rmse'] > 2]\n", "\n" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 65, "metadata": {}, - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original SMILES:CCNc1nc(Cl)nc(NC(C)(C)C)n1\n", + "Number of tautomers generated: 9\n", + "Tautomers:\n", + "1. CCN=c1[nH]c(Cl)nc(=NC(C)(C)C)[nH]1\n", + "2. CCN=c1nc(Cl)[nH]c(=NC(C)(C)C)[nH]1\n", + "3. CCN=c1nc(Cl)[nH]c(NC(C)(C)C)n1\n", + "4. CCN=c1nc(Cl)nc(NC(C)(C)C)[nH]1\n", + "5. CCN=c1nc(NC(C)(C)C)nc(Cl)[nH]1\n", + "6. CCNc1nc(=NC(C)(C)C)nc(Cl)[nH]1\n", + "7. CCNc1nc(Cl)[nH]c(=NC(C)(C)C)n1\n", + "8. CCNc1nc(Cl)nc(=NC(C)(C)C)[nH]1\n", + "9. 
# %% Enumerate the tautomers of one compound and list their unique SMILES
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole  # kept from the original; presumably imported for its notebook-rendering side effects
from rdkit.Chem.MolStandardize import rdMolStandardize

smiles = "CCNc1nc(Cl)nc(NC(C)(C)C)n1"
mol = Chem.MolFromSmiles(smiles)
if mol is None:
    # MolFromSmiles returns None instead of raising on an unparsable SMILES.
    raise ValueError(f"Could not parse SMILES: {smiles!r}")

canonicalizer = rdMolStandardize.TautomerEnumerator()
tautomers = canonicalizer.Enumerate(mol)

# Collect each tautomer's SMILES once, preserving enumeration order.
# The original tested membership against `tautomers` (the enumerator result, which yields
# molecule objects), so the duplicate filter never matched; it must test the list being built.
tautomers_list = []
for tautomer in tautomers:
    tautomer_smiles = Chem.MolToSmiles(tautomer)
    if tautomer_smiles not in tautomers_list:
        tautomers_list.append(tautomer_smiles)

print(f"Original SMILES:{smiles}")
print(f"Number of tautomers generated: {len(tautomers_list)}")
print("Tautomers:")
for rank, tautomer_smiles in enumerate(tautomers_list, start=1):
    print(f"{rank}. {tautomer_smiles}")

# %% Draw the enumerated tautomers as a grid
# The image is displayed only because this call is the last expression of its cell; the same
# call in the middle of the previous cell had its result discarded.
smiles = "CCNc1nc(Cl)nc(NC(C)(C)C)n1"
mol = Chem.MolFromSmiles(smiles)

canonicalizer = rdMolStandardize.TautomerEnumerator()
tautomers = canonicalizer.Enumerate(mol)
Draw.MolsToGridImage(tautomers)
from rdkit import Chem
from rdkit.Chem import AllChem


def calculate_energy(smiles, random_seed=42):
    """Return the UFF energy (kcal/mol) of one minimised 3D conformer of ``smiles``.

    The molecule is parsed, hydrogens are added, a single conformer is embedded and the
    geometry is minimised with the UFF force field (not MMFF94, as an earlier comment said).

    Args:
        smiles: SMILES string of the molecule (here: one tautomer).
        random_seed: Seed passed to the conformer embedding so repeated runs give the same
            geometry and therefore the same energy. The original call was unseeded.

    Returns:
        The force-field energy of the minimised conformer. RDKit reports force-field
        energies in kcal/mol, so the value is returned as-is; the former ``* 0.239``
        (a kJ -> kcal factor) scaled already-correct values while still labelling them
        kcal/mol.

    Raises:
        ValueError: If the SMILES cannot be parsed, no conformer can be embedded, or UFF
            cannot be set up for the molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    # Explicit hydrogens are needed for a meaningful 3D geometry and force-field energy.
    mol = Chem.AddHs(mol)

    # EmbedMolecule returns -1 when it fails to generate coordinates.
    if AllChem.EmbedMolecule(mol, randomSeed=random_seed) == -1:
        raise ValueError(f"Could not generate 3D coordinates for: {smiles!r}")

    force_field = AllChem.UFFGetMoleculeForceField(mol)
    if force_field is None:
        raise ValueError(f"Could not set up the UFF force field for: {smiles!r}")

    force_field.Minimize()
    return force_field.CalcEnergy()


# NOTE(review): each tautomer is scored from a single embedded conformer with a generic
# force field, so the ranking below is a rough indication only.
energies = [calculate_energy(tautomer_smiles) for tautomer_smiles in tautomers_list]

# Print the energy of every tautomer, numbered as in the enumeration listing.
for rank, tautomer_energy in enumerate(energies, start=1):
    print(f"Tautomer {rank}: Energy = {tautomer_energy:.2f} kcal/mol")

# The dominant tautomer is taken to be the one with the lowest energy.
dominant_index = energies.index(min(energies))
print(f"Dominant tautomer: {tautomers_list[dominant_index]}")
import math
from scipy import stats
from statsmodels.formula.api import ols


def plot_read_across(data, x, y, xlabel, ylabel, r2_xy, rmse_xy=None):
    """Scatter `y` against `x` with a fitted line and goodness-of-fit labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    x, y : str
        Column names plotted on the x and y axis.
    xlabel, ylabel : str
        Axis labels (mathtext allowed).
    r2_xy : tuple of float
        Data coordinates at which the adjusted R^2 is annotated.
    rmse_xy : tuple of float, optional
        Data coordinates for an RMSE annotation between the two columns;
        omitted when None.

    Returns
    -------
    seaborn.FacetGrid
        The grid returned by `sns.lmplot`.
    """
    # Adjusted R^2 comes from the OLS fit; slope/intercept for the legend
    # come from linregress on the plotted orientation (y on x).
    model = ols(f'{x} ~ {y}', data=data).fit(cov_type='HC3')
    slope, intercept, _r_value, _p_value, _std_err = stats.linregress(data[x], data[y])

    grid = sns.lmplot(
        data=data, x=x, y=y, ci=95,
        line_kws={'label': "y={0:.2f}x+{1:.2f}".format(slope, intercept)},
    )
    grid.set(xlabel=xlabel, ylabel=ylabel)
    plt.annotate('R$^2$: {}'.format(round(model.rsquared_adj, 2)), r2_xy)
    if rmse_xy is not None:
        mse = np.square(np.subtract(data[x], data[y])).mean()
        plt.annotate('RMSE: {}'.format(round(math.sqrt(mse), 2)), rmse_xy)
    plt.legend()
    return grid


# Geometric-mean half-lives: measured soil vs. sludge.
plot_read_across(
    df_read_across, "hl_log_gmean_x", "hl_log_gmean_y",
    xlabel='log(DT$_{50, gmean, sludge}$)',
    ylabel='log(DT$_{50, gmean, soil}$)',
    r2_xy=(-1.2, 2.4),
)

# Geometric-mean half-lives: calculated soil vs. sludge.
plot_read_across(
    df_read_across, "hl_log_gmean_x", "hl_log_gmean_y_calculated",
    xlabel='log(DT$_{50, gmean, sludge}$)',
    ylabel='calculated log(DT$_{50, gmean, soil}$)',
    r2_xy=(-1, 4), rmse_xy=(-1, 3.7),
)

# Bayesian-mean half-lives: measured soil vs. sludge.
# Raw strings keep the mathtext `\;` spacing without an invalid escape sequence.
plot_read_across(
    df_read_across, "hl_log_bayesian_mean_x", "hl_log_bayesian_mean_y",
    xlabel=r'log(DT$_{50, bayesian\;mean, sludge}$)',
    ylabel=r'log(DT$_{50, bayesian\; mean, soil}$)',
    r2_xy=(-2, 2.5),
)

# Bayesian-mean half-lives: calculated soil vs. sludge.
plot_read_across(
    df_read_across, "hl_log_bayesian_mean_x", "hl_log_bayesian_mean_y_calculated",
    xlabel='log(DT$_{50, bayesian mean, sludge}$)',
    ylabel='calculated log(DT$_{50, bayesian mean, soil}$)',
    r2_xy=(-2, 5), rmse_xy=(-2, 4.6),
)
sys.path.insert(1, "C:\\Users\\leetseng\\enviPath-python\\")
from enviPath import *
from objects import *
import getpass  # imported explicitly rather than relying on the star imports above
import rdkit
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

file_location = "C:\\Users\\leetseng\\TWtest"
# input_file_path = file_location+'input/sludge_compounds_final.txt'
output_file_path_full = file_location+'\\output\\sludge_Rich_raw.tsv'

# Define the instance to use
INSTANCE_HOST = 'https://envipath.org'
USERNAME = 'leetseng'
PASSWORD = getpass.getpass(prompt = "Enter your password:")

eP = enviPath(INSTANCE_HOST)
eP.login(USERNAME, PASSWORD) #getpass.getpass()
# eP.logout()
print(eP.who_am_i().get_name())
# Rich Sludge package. The id previously read '.../packag3e/8dd7ca2-...', i.e.
# the documented id below with one misplaced '3'.
package_id = 'https://envipath.org/package/8d3d7ca2-ae4e-4779-a6a2-d3539237c439'
#Leo Sludge 'https://envipath.org/package/195bc500-f0c6-4bcb-b2fe-f1602b5f20a2'
#Rich Sludge 'https://envipath.org/package/8d3d7ca2-ae4e-4779-a6a2-d3539237c439'
#Original Sludge 'https://envipath.org/package/4a3cd0f4-4d2b-4f00-b3e6-a29e721f7038'
package = Package(eP.requester, id=package_id)
all_scenarios = package.get_scenarios()
all_pathways = package.get_pathways()
print('Number of scenarios found:', len(all_scenarios))
print('Number of pathways found:', len(all_pathways))


def __main__():
    """Collect scenario data for every pathway node and write it to a TSV file.

    Walks all nodes of all pathways in the module-level `all_pathways`, lets
    `add_scenario_information` append one row per qualifying scenario, and
    writes the resulting table to `output_file_path_full`.
    """
    # One list per output column; add_scenario_information appends to all of
    # them together so the columns stay the same length.
    D = {
        "scenario_id": [], "compound_id": [], "compound_name": [], "smiles": [], "reduced_smiles": [],
        "halflife_raw": [], "halflife_unit": [], "halflife_model_TF": [], "halflife_comment": [],
        "rateconstant": [], "rateconstant_unit": [], "rateconstant_comment": [], "halflife_model": [],
        "acidity": [], "acidity_unit": [],
        "temperature": [], "temperature_unit": [],
        "original_sludge_amount": [], "original_sludge_amount_unit": [],
        "sludge_retention_time": [], "sludge_retention_time_unit": [], "sludge_retention_time_type": [],
        "total_suspended_solids_concentration_start": [], "total_suspended_solids_concentration_end": [], "total_suspended_solids_concentration_unit": [],
        # "volatile_suspended_solids_concentration_start": [], "volatile_suspended_solids_concentration_end": [], "volatile_suspended_solids_concentration_unit": [],
        "addition_of_nutrients": [], "biological_treatment_technology": [],
        "bioreactor_type": [], "bioreactor_value": [], "bioreactor_value_unit": [],
        "nitrogen_content_type": [], "nitrogen_content_influent": [],
        "oxygen_demand_type": [], "oxygen_demand_value": [],
        "oxygen_uptake_rate": [], "oxygen_uptake_rate_unit": [],
        "phosphorus_content": [],
        "redox": [],
        "source_of_liquid_matrix": [],
        "type_of_addition": [],
        "type_of_aeration": [],
        "inoculum_source": [],
        "location": [],
        "purpose_of_wwtp": [],
    }

    i = 0
    for pathway in all_pathways:
        for node in pathway.get_nodes():
            print(node.get_id()) #this is not compound_id, it should be the Node id.
            i += 1
            print("checking node # ", i)
            try:
                scenarios = node.get_scenarios()
            except Exception:
                # Best effort: a node whose scenarios cannot be fetched is
                # skipped, but Ctrl-C / SystemExit still stop the run.
                continue
            for scenario in scenarios:
                D = add_scenario_information(D, scenario, node)
    sludge_df = pd.DataFrame.from_dict(D)
    sludge_df.index = np.arange(1, len(sludge_df) + 1)
    sludge_df.index.name = 'index'
    print(sludge_df.describe())
    sludge_df.to_csv(output_file_path_full, mode='w', sep="\t") #a, w, r
def add_scenario_information(D, scenario, node):
    """Append one row for `scenario` to the column lists in `D`.

    A row is added only when the scenario's additional information exposes a
    half-life or a rate constant. `D` is modified in place and returned.
    """
    full_scenario = Scenario(eP.requester, id=scenario.get_id())
    add_info = full_scenario.get_additional_information()

    def _provides(getter_name):
        # True when the getter exists and can be called without AttributeError.
        try:
            getattr(add_info, getter_name)()
        except AttributeError:
            return False
        return True

    has_hf = _provides('get_halflife')
    has_rateconstant = _provides('get_rateconstant')
    if not (has_hf or has_rateconstant):
        return D

    # Columns derived from the pathway node.
    node_columns = (
        ('compound_id', arcessere_compound_id),
        ('compound_name', arcessere_compound_name),
        ('smiles', arcessere_smiles),
        ('reduced_smiles', lambda n: canonicalize_smiles(arcessere_smiles(n))),
    )
    for column, read in node_columns:
        D[column].append(read(node))

    print(full_scenario.get_id())

    # Columns derived from the scenario's additional information.
    info_columns = (
        ('acidity', arcessere_acidity),
        ('acidity_unit', arcessere_acidity_unit),
        ('addition_of_nutrients', arcessere_addition_of_nutrients),
        ('biological_treatment_technology', arcessere_biological_treatment_technology),
        ('bioreactor_type', arcessere_bioreactor_type),
        ('bioreactor_value', arcessere_bioreactor_value),
        ('bioreactor_value_unit', arcessere_bioreactor_value_unit),
        ('halflife_raw', arcessere_halflife),
        ('halflife_unit', arcessere_halflife_unit),
        ('halflife_model_TF', arcessere_halflife_model),
        ('halflife_comment', arcessere_halflife_comment),
        ('inoculum_source', arcessere_inoculum_source),
        ('location', arcessere_location),
        ('nitrogen_content_type', arcessere_nitrogen_content_type),
        ('nitrogen_content_influent', arcessere_nitrogen_content_influent),
        ('original_sludge_amount', arcessere_original_sludge_amount),
        ('original_sludge_amount_unit', arcessere_original_sludge_amount_unit),
        ('oxygen_demand_type', arcessere_oxygen_demand_type),
        ('oxygen_demand_value', arcessere_oxygen_demand_value),
        ('oxygen_uptake_rate_unit', arcessere_oxygen_uptake_rate_unit),
        ('oxygen_uptake_rate', arcessere_oxygen_uptake_rate),
        ('phosphorus_content', arcessere_phosphorus_content),
        ('purpose_of_wwtp', arcessere_purpose_of_wwtp),
        ('rateconstant', arcessere_rate_constant),
        ('rateconstant_unit', arcessere_rate_constant_unit),
        ('halflife_model', arcessere_reaction_order),
        ('rateconstant_comment', arcessere_rate_constant_comment),
        ('redox', arcessere_redox),
        ('scenario_id', lambda _info: scenario.get_id()),
        ('sludge_retention_time_type', arcessere_sludge_retention_time_type),
        ('sludge_retention_time', arcessere_sludge_retention_time),
        ('sludge_retention_time_unit', arcessere_sludge_retention_time_unit),
        ('source_of_liquid_matrix', arcessere_source_of_liquid_matrix),
        ('temperature', arcessere_temperature),
        ('temperature_unit', arcessere_temperature_unit),
        ('total_suspended_solids_concentration_start', arcessere_tss_start),
        ('total_suspended_solids_concentration_end', arcessere_tss_end),
        ('total_suspended_solids_concentration_unit', arcessere_tss_unit),
        ('type_of_addition', arcessere_type_of_addition),
        ('type_of_aeration', arcessere_type_of_aeration),
    )
    for column, read in info_columns:
        D[column].append(read(add_info))
    return D
def arcessere_compound_id(node):
    """Return the id of the node's default structure, or '' if unavailable."""
    try:
        return node.get_default_structure().get_id()
    except Exception:
        return ''


def arcessere_compound_name(node):
    """Return the name of the node's default structure, or '' if unavailable.

    Any lookup failure yields '' (previously only ValueError was handled,
    unlike the sibling node helpers).
    """
    try:
        return node.get_default_structure().get_name()
    except Exception:
        return ''


def arcessere_smiles(node):
    """Return the SMILES of the node's default structure, or '' if unavailable."""
    try:
        return node.get_default_structure().get_smiles()
    except Exception:
        return ''


def canonicalize_smiles(smiles_from_node):
    """Return the canonical SMILES of the neutralised structure.

    Returns '' when RDKit cannot parse `smiles_from_node`.
    """
    mol = Chem.MolFromSmiles(smiles_from_node)
    if mol is None:
        # Unparsable input: use the same '' sentinel as arcessere_smiles.
        return ''
    uncharged = rdMolStandardize.Uncharger().uncharge(mol)
    return Chem.CanonSmiles(Chem.MolToSmiles(uncharged))


def arcessere_acidity(add_info):
    """Return the pH rounded to one decimal, or NaN when it is not recorded.

    Only the part before the first ';' is used. A 'low - high' range is
    replaced by its mean; a single number is used as is (values without ';'
    previously raised UnboundLocalError).
    """
    try:
        raw_pH = add_info.get_acidity().get_value()
    except Exception:
        return np.nan
    first_field = raw_pH.split(';')[0]
    if ' - ' in first_field:
        low, high = first_field.split(' - ')[:2]
        pH = np.average([float(low), float(high)])
    else:
        pH = float(first_field)
    return np.round(pH, 1)


def arcessere_acidity_unit(add_info):
    """Return the unit of the acidity entry, or '' when it is not recorded."""
    try:
        return add_info.get_acidity().get_unit()
    except Exception:
        return ''


def arcessere_addition_of_nutrients(add_info):
    """Return the 'addition of nutrients' entry, or '' when it is not recorded."""
    try:
        return add_info.get_additionofnutrients().get_value()
    except Exception:
        return ''
def arcessere_biological_treatment_technology(add_info):
    """Return the biological treatment technology, or '' when not recorded."""
    try:
        return add_info.get_biologicaltreatmenttechnology().get_value()
    except Exception:
        return ''


def arcessere_bioreactor_type(add_info):
    """Return the part of the bioreactor entry before the first ',', or ''."""
    try:
        return add_info.get_bioreactor().get_value().split(',')[0]
    except Exception:
        return ''


def arcessere_bioreactor_value(add_info):
    """Return the number after the first ',' of the bioreactor entry, or NaN."""
    try:
        return float(add_info.get_bioreactor().get_value().split(',')[1])
    except Exception:
        # Missing entry, missing second field or non-numeric text.
        return np.nan


def arcessere_bioreactor_value_unit(add_info):
    """Return the unit of the bioreactor entry, or '' when not recorded."""
    try:
        return add_info.get_bioreactor().get_unit()
    except Exception:
        return ''


def arcessere_confidencelevel(add_info):
    """Return the confidence level as float, or NaN when missing or non-numeric."""
    try:
        return float(add_info.get_confidencelevel().get_value())
    except Exception:
        return np.nan


def arcessere_inoculum_source(add_info):
    """Return the inoculum source, or '' when not recorded."""
    try:
        return add_info.get_inoculumsource().get_value()
    except Exception:
        return ''


def arcessere_location(add_info):
    """Return the location, or '' when not recorded."""
    try:
        return add_info.get_location().get_value()
    except Exception:
        return ''


def arcessere_minormajor(add_info):
    """Return the minor/major entry, or '' when not recorded."""
    try:
        return add_info.get_minormajor().get_value()
    except Exception:
        return ''


def arcessere_nitrogen_content_type(add_info):
    """Return the part of the nitrogen-content entry before ';', or ''.

    The earlier version returned the field only when it contained a subscript
    2, 3 or 4 (and None otherwise); its replace() calls swapped each subscript
    for the identical character, so the field is now returned unchanged in
    every case.
    """
    try:
        nitrogencontent = add_info.get_nitrogencontent().get_value()
    except Exception:
        return ''
    return nitrogencontent.split(';')[0]
def arcessere_nitrogen_content_influent(add_info):
    """Return the number after ';' in the nitrogen-content entry, or NaN."""
    try:
        return float(add_info.get_nitrogencontent().get_value().split(';')[1])
    except Exception:
        return np.nan


def arcessere_original_sludge_amount(add_info):
    """Return the original sludge amount as stored, or NaN when not recorded."""
    try:
        return add_info.get_originalsludgeamount().get_value()
    except Exception:
        return np.nan


def arcessere_original_sludge_amount_unit(add_info):
    """Return the unit of the original sludge amount, or ''."""
    try:
        return add_info.get_originalsludgeamount().get_unit()
    except Exception:
        return ''


def arcessere_oxygen_demand_type(add_info): #checking scenario 298
    """Return the part of the oxygen-demand entry before ';', or ''."""
    try:
        return add_info.get_oxygendemand().get_value().split(';')[0]
    except Exception:
        return ''


def arcessere_oxygen_demand_value(add_info):
    """Return the number after ';' in the oxygen-demand entry, or NaN."""
    try:
        return float(add_info.get_oxygendemand().get_value().split(';')[1])
    except Exception:
        return np.nan


def arcessere_oxygen_uptake_rate(add_info):
    """Return the oxygen uptake rate (mean of a range), or NaN.

    The getter is now actually called; the earlier code accessed
    `get_oxygenuptakerate.get_value` on the method object, which always failed
    and therefore always produced NaN.
    """
    try:
        raw = add_info.get_oxygenuptakerate().get_value()
        # Accept 'low - high', 'low;high' or a single number.
        parts = [p for p in str(raw).replace(';', ' - ').split(' - ') if p.strip()]
        bounds = [float(p) for p in parts[:2]]
    except Exception:
        return np.nan
    if not bounds:
        return np.nan
    return float(np.average(bounds))


def arcessere_oxygen_uptake_rate_unit(add_info):
    """Return the oxygen-uptake-rate unit, or '' when not recorded.

    A three-token unit whose second and third tokens carry '⁻¹' is rewritten
    as '<first token>/(L * h)'; any other unit is returned unchanged (it used
    to yield None or raise IndexError).
    """
    try:
        unit = add_info.get_oxygenuptakerate().get_unit()
    except Exception:
        return ''
    if not unit:
        return ''
    tokens = unit.split(' ')
    if len(tokens) >= 3 and "⁻¹" in tokens[1] and "⁻¹" in tokens[2]:
        return tokens[0] + '/(L * h)'
    return unit


def arcessere_phosphorus_content(add_info):
    """Return the part of the phosphorus-content entry before ';', or NaN."""
    try:
        return add_info.get_phosphoruscontent().get_value().split(';')[0]
    except Exception:
        return np.nan


def arcessere_proposed_intermediate(add_info):
    """Return the proposed-intermediate entry, or '' when not recorded."""
    try:
        return add_info.get_proposedintermediate().get_value()
    except Exception:
        return ''
def arcessere_purpose_of_wwtp(add_info):
    """Return the purpose of the WWTP, or '' when not recorded."""
    try:
        return add_info.get_purposeofwwtp().get_value()
    except Exception:
        return ''


def arcessere_redox(add_info):
    """Return the redox condition, or '' when not recorded."""
    try:
        return add_info.get_redox().get_value()
    except Exception:
        return ''


def arcessere_sludge_retention_time_type(add_info):
    """Return the part of the sludge-retention-time entry before ';', or ''."""
    try:
        return add_info.get_sludgeretentiontime().get_value().split(';')[0]
    except Exception:
        return ''


def arcessere_sludge_retention_time(add_info):
    """Return the number after ';' in the sludge-retention-time entry, or NaN."""
    try:
        return float(add_info.get_sludgeretentiontime().get_value().split(';')[1])
    except Exception:
        return np.nan


def arcessere_sludge_retention_time_unit(add_info):
    """Return the unit of the sludge retention time, or ''."""
    try:
        return add_info.get_sludgeretentiontime().get_unit()
    except Exception:
        return ''


def arcessere_source_of_liquid_matrix(add_info):
    """Return the source of the liquid matrix, or '' when not recorded."""
    try:
        return add_info.get_sourceofliquidmatrix().get_value()
    except Exception:
        return ''


def arcessere_tss_start(add_info):
    """Return the lower bound of the total-suspended-solids entry, or NaN.

    The entry is read as '<low> - <high>', optionally followed by ' _ ' and
    further text that is ignored.
    """
    try:
        tts = add_info.get_tts().get_value()
        return float(tts.split(' _ ')[0].split(' - ')[0])
    except Exception:
        return np.nan


def arcessere_tss_end(add_info):
    """Return the upper bound of the total-suspended-solids entry, or NaN.

    A single value without ' - ' is used as the upper bound too (this case
    used to raise IndexError).
    """
    try:
        tts = add_info.get_tts().get_value()
        bounds = tts.split(' _ ')[0].split(' - ')
        return float(bounds[1] if len(bounds) > 1 else bounds[0])
    except Exception:
        return np.nan


def arcessere_tss_unit(add_info):
    """Return the unit of the total-suspended-solids entry, or ''."""
    try:
        return add_info.get_tts().get_unit()
    except Exception:
        return ''


def arcessere_type_of_addition(add_info):
    """Return the type of addition, or '' when not recorded."""
    try:
        return add_info.get_typeofaddition().get_value()
    except Exception:
        return ''


def arcessere_type_of_aeration(add_info):
    """Return the type of aeration, or '' when not recorded."""
    try:
        return add_info.get_typeofaeration().get_value()
    except Exception:
        return ''
def arcessere_volatile_ss_start(add_info):
    """Return the lower bound of the volatile-suspended-solids entry, or NaN."""
    try:
        vss_start = add_info.get_volatiletts().get_value()
        return float(vss_start.split(' - ')[0])
    except Exception:
        return np.nan


def arcessere_volatile_ss_end(add_info):
    """Return the upper bound of the volatile-suspended-solids entry, or NaN.

    A single value without ' - ' is returned as is.
    """
    try:
        vss_end = add_info.get_volatiletts().get_value()
        bounds = vss_end.split(' - ')
        return float(bounds[1] if len(bounds) > 1 else bounds[0])
    except Exception:
        return np.nan


def arcessere_volatile_ss_unit(add_info):
    """Return the unit of the volatile-suspended-solids entry, or ''."""
    try:
        return add_info.get_volatiletts().get_unit()
    except Exception:
        return ''


def range_to_average(input_string):
    """Return the mean of a 'low - high' or 'low;high' string.

    A plain numeric string is converted to float; text that is neither a
    range nor a number is returned unchanged. The range separator tested is
    now the same ' - ' that is used for splitting, so values such as '-5' or
    '1e-3' are no longer mistaken for ranges.
    """
    for separator in (' - ', ';'):
        if separator in input_string:
            low, high = input_string.split(separator)[:2]
            return np.average([float(low), float(high)])
    try:
        return float(input_string)
    except (TypeError, ValueError):
        return input_string


def arcessere_rate_constant(add_info):
    """Return the rate constant (mean of the available bounds), or NaN.

    The third ';'-separated field is read as '<low> - <high>'; bounds spelled
    'NaN' are ignored. NaN is returned when the entry is missing, malformed,
    or has no usable bound (the latter used to raise UnboundLocalError).
    """
    try:
        rate_constant = add_info.get_rateconstant().get_value()
        bounds = rate_constant.split(';')[2].split(' - ')[:2]
        numbers = [float(bound) for bound in bounds if bound != 'NaN']
    except Exception:
        return np.nan
    if not numbers:
        return np.nan
    return float(np.average(numbers))


def arcessere_rate_constant_unit(add_info):
    """Return the rate-constant unit with 'μg' replaced by '\u338D', or ''."""
    try:
        rate_constant_unit = add_info.get_rateconstant().get_unit()
    except Exception:
        return ''
    return rate_constant_unit.replace('μg', '\u338D')


def arcessere_reaction_order(add_info):
    """Return the part of the rate-constant entry before the first ';', or ''."""
    try:
        return add_info.get_rateconstant().get_value().split(';')[0]
    except Exception:
        return ''
def arcessere_rate_constant_comment(add_info):
    """Return the fourth ';'-separated field of the rate-constant entry, or ''."""
    try:
        return add_info.get_rateconstant().get_value().split(';')[3]
    except Exception:
        return ''


def arcessere_halflife(add_info):
    """Return the half-life (mean of a '<low> - <high>' range), or NaN.

    The value is read from the fourth ';'-separated field. A single number
    without ' - ' is used as is (this case used to raise IndexError).
    """
    try:
        hf = add_info.get_halflife().get_value()
        bounds = [float(part) for part in hf.split(';')[3].split(' - ')[:2]]
    except Exception:
        return np.nan
    return np.average(bounds)


def arcessere_halflife_unit(add_info):
    """Return the unit of the half-life entry, or '' when not recorded."""
    try:
        return add_info.get_halflife().get_unit()
    except Exception:
        return ''


def arcessere_halflife_model(add_info):
    """Return the first ';'-separated field of the half-life entry, or ''."""
    try:
        return add_info.get_halflife().get_value().split(';')[0]
    except Exception:
        return ''


def arcessere_halflife_comment(add_info):
    """Return the third ';'-separated field of the half-life entry, or ''."""
    try:
        return add_info.get_halflife().get_value().split(';')[2]
    except Exception:
        return ''


def arcessere_halflife_source(add_info):
    """Return the fifth ';'-separated field of the half-life entry, or ''."""
    try:
        return add_info.get_halflife().get_value().split(';')[4]
    except Exception:
        return ''


def arcessere_temperature(add_info):
    """Return the temperature (mean of 'min;max'), rounded to 0 decimals, or NaN.

    A single value without ';' is used as is (this case used to raise
    IndexError).
    """
    try:
        temp = add_info.get_temperature().get_value()
        bounds = [float(part) for part in temp.split(';')[:2] if part.strip()]
    except Exception:
        return np.nan
    if not bounds:
        return np.nan
    return np.round(np.average(bounds), 0)


def arcessere_temperature_unit(add_info):
    """Return '\u2103' when the scenario has a temperature unit, else ''."""
    try:
        # Only the presence of a unit matters; its text is not used.
        add_info.get_temperature().get_unit()
    except Exception:
        return ''
    return '\u2103'
+# scenario_datatypes = set() +# for scenario in all_scenarios[350:494]: +# scen = Scenario(eP.requester, id=scenario.get_id()) +# addinfo = scen.get_additional_information() +# if addinfo: +# for dt in addinfo.get_data_types(): +# scenario_datatypes.add(dt) +# print(scenario_atatypes) +# +# "Dissolvedoxygenconcentrationaerationtype": "No attribute", +# "finalcompoundconcentration": "No attribute", +# "solventforcompoundsolution": "No attribute", +# "dissolvedorganiccarbonamionauptakerate": 0, +# "typeofaerationoxygenuptakerate": 0, +# +# scenario_data_types_count = { +# "acidity": 439, +# "additionofnutrients": 38, +# "biologicaltreatmenttechnology": 356, +# "bioreactor_type": 491, +# "bioreactor_value": 491, +# "confidencelevel": 164, +# "halflife": 15, +# "halflife_model": 15, +# "halflife_comment": 15, +# "inoculumsource": 449, +# "location": 473, +# "minormajor": 18, +# "nitrogencontent_type": 105, +# "nitrogencontent_influent": 105, +# "originalsludgeamount": 408, +# "oxygendemand": 99, +# "phosphoruscontent": 88, +# "proposedintermediate": 18, +# "purposeofwwtp": 475, +# "rateconstant": 218, +# "rateconstant_comment": 218, +# "reaction_order": 218, +# "redox": 482, +# "sludgeretentiontime": 381, +# "sludgeretentiontimetype": 381, +# "sourceofliquidmatrix": 403, +# "temperature": 403, +# "tts_start": 398, +# "tts_end": 398, +# "typeofaddition": 426, +# "typeofaeration": 470, +# "volatiletts_start": 15, +# "volatiletts_end": 15, +# } +# df2 = pd.DataFrame.from_dict(scenario_data_types_count, orient='index') +# print(df2) +# sns.countplot(data=df2, x=) +# plt.show() + + +__main__() \ No newline at end of file diff --git a/output/bay_area/find_tautomers.ipynb b/output/bay_area/find_tautomers.ipynb new file mode 100644 index 0000000..835418d --- /dev/null +++ b/output/bay_area/find_tautomers.ipynb @@ -0,0 +1,38 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "Python 3.9.13 ('envipath')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "2347b11aabc7676a0034671a85ea3c0a49e970a2c62f233a06be25b5abf82f7b" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/output/bay_area/prepare_descriptors.ipynb b/output/bay_area/prepare_descriptors.ipynb index 4f8c7a1..c475138 100644 --- a/output/bay_area/prepare_descriptors.ipynb +++ b/output/bay_area/prepare_descriptors.ipynb @@ -1,2715 +1,2768 @@ { "cells": [ { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from scipy.stats import gmean\n", "pd.options.display.max_columns = None " ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# input_file_path = \"C:\\\\Users\\\\leetseng\\\\TWtest\\\\input\\\\\"\n", "output_path = \"C:\\\\Users\\\\leetseng\\\\TWtest\\\\output\\\\bay_area\\\\\"\n", "output_path_des = output_path + \"descriptors\\\\\"\n", "\n", "#This files already contain the bay mean and bay std.\n", "#The reduced SMILES of previous files were generated from other toolkit. 
def index_merge_descriptor(df1, df2):
    """Inner-join two frames on their shared 'index' column."""
    df_new = pd.merge(df1, df2, how='inner', on='index')
    # print(df_new.shape)
    return df_new


def non_nan_mean(x):
    """Return the mean of `x` ignoring NaN, or None when `x` is empty."""
    if x.empty:
        return None
    return np.mean(x.dropna())


def index_to_smiles(df_target_Y, des_):
    """Re-key a descriptor table from row 'index' to 'reduced_smiles'.

    Parameters
    ----------
    df_target_Y : pandas.DataFrame
        Data table with at least the columns 'reduced_smiles' and 'index'.
    des_ : pandas.DataFrame
        Descriptor table with an 'index' column that refers to rows of
        `df_target_Y`. Both frames should come from the same package
        (sludge or soil).

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'reduced_smiles' with the descriptor columns of
        that compound; the 'index' column is dropped. Compounds without a
        matching descriptor row keep NaN descriptors (left join).
    """
    target_Y_ = df_target_Y.copy()
    # Represent each compound by the index of its first row. Averaging the
    # indices of a compound that occupies several rows (as done before)
    # yields a number that can belong to a different compound.
    XY = (
        target_Y_[["reduced_smiles", "index"]]
        .groupby("reduced_smiles", as_index=False)["index"]
        .first()
    )
    XY['index'] = XY['index'].astype(np.int64)

    XY_merge_des = pd.merge(XY, des_, how='left', on='index')
    return XY_merge_des.drop(columns=['index'])
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexstruct-1struct-2struct-3struct-4struct-5struct-6struct-7struct-8struct-9struct-10struct-11struct-12struct-13struct-14struct-15struct-16struct-17struct-18struct-19struct-20struct-21struct-22struct-23struct-24struct-25struct-26struct-27struct-28struct-29struct-30struct-31struct-32struct-33struct-34struct-35struct-36struct-37struct-38struct-39struct-40struct-41struct-42struct-43struct-44struct-45struct-46struct-47struct-48struct-49struct-50struct-51struct-52struct-53struct-54struct-55struct-56struct-57struct-58struct-59struct-60struct-61struct-62struct-63struct-64struct-65struct-66struct-67struct-68struct-69struct-70struct-71struct-72struct-73struct-74struct-75struct-76struct-77struct-78struct-79struct-80struct-81struct-82struct-83struct-84struct-85struct-86struct-87struct-88struct-89struct-90struct-91struct-92struct-93struct-94struct-95struct-96struct-97struct-98struct-99struct-100struct-101struct-102struct-103struct-104struct-105struct-106struct-107struct-108struct-109struct-110struct-111struct-112struct-113struct-114struct-115struct-116struct-117struct-118struct-119struct-120struct-121struct-122struct-123struct-124struct-125struct-126struct-127struct-128struct-129struct-130struct-131struct-132struct-133struct-134struct-135struct-136struct-137struct-138struct-139struct-140struct-141struct-142struct-143struct-144struct-145struct-146struct-147struct-148struct-149struct-150struct-151struct-152struct-153struct-154struct-155struct-156struct-157struct-158struct-159struct-160struct-161struct-162struct-163struct-164struct-165struct-166
040000000000000000000000000000000000000000000000000001000000000000100010000000101100100000000001010000000000000000000000011001000000000000100001000000001000000000110010
150000000000000000000000000000000000000000000000000001000000000000100000000010101100100000000001111001000100000010000000011111000000010000101001000001000011111110110110
260000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010001000000000000000000000000000000100000000000000000010100000000000
3110000000000000000000000000000000000001000000000000000000000000000000000000100100001001100000010000000000001001100001010000100000000101010001011000001101011111111111110
4150000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100010000000000000000010100000000000001000001000000001001010011110

88925930000000000000000000000000000000000000011000000010010101001011000001010001000000000000001110100001000010101010111010010000001000001111011001100010101111011110111111110
89025990000000000000000000000000001000000000000000000000000000000000000000000010000000000001100101100100001000000011100010110000110010100001011011110010101110011111111111110
89126000000000000000000000000000001000000000000000000000010000000011000001000001000000000001101101100001001010100011110010110000111010110011011011110010101110011111111111110
89226010000000000000000000000000001000000000011000000010010001001011000001010001000000000001101100100001001010101011111010110000101010111011011011110010101110011111111111110
89326050000000000000000000000000000000000000000000000000000000010000100000000000010000000001100000000100101100000010010001001011100000110011000110000000011100110011101101110
\n", "

894 rows × 167 columns

\n", "
" ], "text/plain": [ " index struct-1 struct-2 struct-3 struct-4 struct-5 struct-6 \\\n", "0 4 0 0 0 0 0 0 \n", "1 5 0 0 0 0 0 0 \n", "2 6 0 0 0 0 0 0 \n", "3 11 0 0 0 0 0 0 \n", "4 15 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... ... \n", "889 2593 0 0 0 0 0 0 \n", "890 2599 0 0 0 0 0 0 \n", "891 2600 0 0 0 0 0 0 \n", "892 2601 0 0 0 0 0 0 \n", "893 2605 0 0 0 0 0 0 \n", "\n", " struct-7 struct-8 struct-9 struct-10 struct-11 struct-12 struct-13 \\\n", "0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... ... \n", "889 0 0 0 0 0 0 0 \n", "890 0 0 0 0 0 0 0 \n", "891 0 0 0 0 0 0 0 \n", "892 0 0 0 0 0 0 0 \n", "893 0 0 0 0 0 0 0 \n", "\n", " struct-14 struct-15 struct-16 struct-17 struct-18 struct-19 \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 0 0 0 0 0 \n", "890 0 0 0 0 0 0 \n", "891 0 0 0 0 0 0 \n", "892 0 0 0 0 0 0 \n", "893 0 0 0 0 0 0 \n", "\n", " struct-20 struct-21 struct-22 struct-23 struct-24 struct-25 \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 0 0 0 0 0 \n", "890 0 0 0 0 0 0 \n", "891 0 0 0 0 0 0 \n", "892 0 0 0 0 0 0 \n", "893 0 0 0 0 0 0 \n", "\n", " struct-26 struct-27 struct-28 struct-29 struct-30 struct-31 \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 0 0 0 0 0 \n", "890 0 0 1 0 0 0 \n", "891 0 0 1 0 0 0 \n", "892 0 0 1 0 0 0 \n", "893 0 0 0 0 0 0 \n", "\n", " struct-32 struct-33 struct-34 struct-35 struct-36 struct-37 \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 1 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... 
\n", "889 0 0 0 0 0 0 \n", "890 0 0 0 0 0 0 \n", "891 0 0 0 0 0 0 \n", "892 0 0 0 0 0 0 \n", "893 0 0 0 0 0 0 \n", "\n", " struct-38 struct-39 struct-40 struct-41 struct-42 struct-43 \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 1 1 0 0 0 \n", "890 0 0 0 0 0 0 \n", "891 0 0 0 0 0 0 \n", "892 0 1 1 0 0 0 \n", "893 0 0 0 0 0 0 \n", "\n", " struct-44 struct-45 struct-46 struct-47 struct-48 struct-49 \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 0 0 0 1 0 \n", "890 0 0 0 0 0 0 \n", "891 0 0 0 0 0 0 \n", "892 0 0 0 0 1 0 \n", "893 0 0 0 0 0 0 \n", "\n", " struct-50 struct-51 struct-52 struct-53 struct-54 struct-55 \\\n", "0 0 0 1 0 0 0 \n", "1 0 0 1 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 1 0 1 0 1 \n", "890 0 0 0 0 0 0 \n", "891 0 1 0 0 0 0 \n", "892 0 1 0 0 0 1 \n", "893 0 0 0 0 0 0 \n", "\n", " struct-56 struct-57 struct-58 struct-59 struct-60 struct-61 \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 0 1 0 1 1 \n", "890 0 0 0 0 0 0 \n", "891 0 0 0 0 1 1 \n", "892 0 0 1 0 1 1 \n", "893 0 1 0 0 0 0 \n", "\n", " struct-62 struct-63 struct-64 struct-65 struct-66 struct-67 \\\n", "0 0 0 0 1 0 0 \n", "1 0 0 0 1 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 0 0 0 0 1 \n", "890 0 0 0 0 0 0 \n", "891 0 0 0 0 0 1 \n", "892 0 0 0 0 0 1 \n", "893 1 0 0 0 0 0 \n", "\n", " struct-68 struct-69 struct-70 struct-71 struct-72 struct-73 \\\n", "0 0 1 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... 
\n", "889 0 1 0 0 0 1 \n", "890 0 0 0 0 1 0 \n", "891 0 0 0 0 0 1 \n", "892 0 1 0 0 0 1 \n", "893 0 0 0 0 0 0 \n", "\n", " struct-74 struct-75 struct-76 struct-77 struct-78 struct-79 \\\n", "0 0 0 0 1 0 1 \n", "1 0 1 0 1 0 1 \n", "2 0 0 0 0 0 0 \n", "3 1 0 0 1 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 0 0 0 0 0 \n", "890 0 0 0 0 0 0 \n", "891 0 0 0 0 0 0 \n", "892 0 0 0 0 0 0 \n", "893 0 1 0 0 0 0 \n", "\n", " struct-80 struct-81 struct-82 struct-83 struct-84 struct-85 \\\n", "0 1 0 0 1 0 0 \n", "1 1 0 0 1 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 1 0 0 1 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 0 0 0 0 0 \n", "890 0 0 0 0 0 1 \n", "891 0 0 0 0 0 1 \n", "892 0 0 0 0 0 1 \n", "893 0 0 0 0 0 1 \n", "\n", " struct-86 struct-87 struct-88 struct-89 struct-90 struct-91 \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 1 0 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 0 1 1 1 0 \n", "890 1 0 0 1 0 1 \n", "891 1 0 1 1 0 1 \n", "892 1 0 1 1 0 0 \n", "893 1 0 0 0 0 0 \n", "\n", " struct-92 struct-93 struct-94 struct-95 struct-96 struct-97 \\\n", "0 0 0 1 0 1 0 \n", "1 0 0 1 1 1 1 \n", "2 0 0 0 0 0 0 \n", "3 0 1 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 1 0 0 0 0 1 \n", "890 1 0 0 1 0 0 \n", "891 1 0 0 0 0 1 \n", "892 1 0 0 0 0 1 \n", "893 0 0 0 1 0 0 \n", "\n", " struct-98 struct-99 struct-100 struct-101 struct-102 struct-103 \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 1 0 0 0 \n", "2 0 1 0 0 0 1 \n", "3 0 0 0 0 0 0 \n", "4 0 0 0 1 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 0 0 0 1 0 \n", "890 0 0 1 0 0 0 \n", "891 0 0 1 0 1 0 \n", "892 0 0 1 0 1 0 \n", "893 1 0 1 1 0 0 \n", "\n", " struct-104 struct-105 struct-106 struct-107 struct-108 struct-109 \\\n", "0 0 0 0 0 0 0 \n", "1 1 0 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 1 0 0 1 \n", "4 0 1 0 0 0 0 \n", ".. ... ... ... ... ... ... 
\n", "889 1 0 1 0 1 0 \n", "890 0 0 0 0 1 1 \n", "891 1 0 0 0 1 1 \n", "892 1 0 1 0 1 1 \n", "893 0 0 0 0 1 0 \n", "\n", " struct-110 struct-111 struct-112 struct-113 struct-114 struct-115 \\\n", "0 0 0 0 0 0 0 \n", "1 0 1 0 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 1 0 0 0 0 1 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 1 1 1 0 1 0 \n", "890 1 0 0 0 1 0 \n", "891 1 1 0 0 1 0 \n", "892 1 1 1 0 1 0 \n", "893 0 1 0 0 0 1 \n", "\n", " struct-116 struct-117 struct-118 struct-119 struct-120 struct-121 \\\n", "0 0 0 0 0 1 1 \n", "1 0 0 0 0 1 1 \n", "2 0 0 0 0 0 0 \n", "3 0 1 0 0 0 0 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 1 0 0 0 0 \n", "890 1 1 0 0 0 0 \n", "891 1 1 0 0 0 0 \n", "892 1 1 0 0 0 0 \n", "893 0 0 1 0 1 1 \n", "\n", " struct-122 struct-123 struct-124 struct-125 struct-126 struct-127 \\\n", "0 0 0 1 0 0 0 \n", "1 1 1 1 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 1 0 0 0 0 0 \n", "4 0 1 0 1 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 0 1 0 0 0 \n", "890 1 1 0 0 1 0 \n", "891 1 1 1 0 1 0 \n", "892 1 0 1 0 1 0 \n", "893 1 0 0 0 0 0 \n", "\n", " struct-128 struct-129 struct-130 struct-131 struct-132 struct-133 \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 1 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 0 1 0 1 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 0 0 1 1 1 1 \n", "890 1 0 0 0 0 1 \n", "891 1 1 0 0 1 1 \n", "892 1 1 1 0 1 1 \n", "893 1 1 0 0 1 1 \n", "\n", " struct-134 struct-135 struct-136 struct-137 struct-138 struct-139 \\\n", "0 0 0 0 1 0 0 \n", "1 0 0 0 1 0 1 \n", "2 1 0 0 0 0 0 \n", "3 0 1 0 0 0 1 \n", "4 0 0 0 0 0 1 \n", ".. ... ... ... ... ... ... \n", "889 0 1 1 0 0 1 \n", "890 0 1 1 0 1 1 \n", "891 0 1 1 0 1 1 \n", "892 0 1 1 0 1 1 \n", "893 0 0 0 1 1 0 \n", "\n", " struct-140 struct-141 struct-142 struct-143 struct-144 struct-145 \\\n", "0 0 0 1 0 0 0 \n", "1 0 0 1 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 1 1 0 0 0 \n", "4 0 0 0 0 0 1 \n", ".. ... ... ... ... ... ... 
\n", "889 1 0 0 0 1 0 \n", "890 1 1 0 0 1 0 \n", "891 1 1 0 0 1 0 \n", "892 1 1 0 0 1 0 \n", "893 0 0 0 0 0 0 \n", "\n", " struct-146 struct-147 struct-148 struct-149 struct-150 struct-151 \\\n", "0 0 0 0 0 0 1 \n", "1 0 0 1 0 0 0 \n", "2 0 0 0 0 0 0 \n", "3 0 0 1 1 0 1 \n", "4 0 0 0 0 0 0 \n", ".. ... ... ... ... ... ... \n", "889 1 0 1 1 1 1 \n", "890 1 0 1 1 1 0 \n", "891 1 0 1 1 1 0 \n", "892 1 0 1 1 1 0 \n", "893 0 1 1 1 0 0 \n", "\n", " struct-152 struct-153 struct-154 struct-155 struct-156 struct-157 \\\n", "0 0 0 0 0 0 0 \n", "1 0 1 1 1 1 1 \n", "2 0 1 0 1 0 0 \n", "3 0 1 1 1 1 1 \n", "4 0 0 1 0 0 1 \n", ".. ... ... ... ... ... ... \n", "889 0 1 1 1 1 0 \n", "890 0 1 1 1 1 1 \n", "891 0 1 1 1 1 1 \n", "892 0 1 1 1 1 1 \n", "893 1 1 0 0 1 1 \n", "\n", " struct-158 struct-159 struct-160 struct-161 struct-162 struct-163 \\\n", "0 0 0 0 1 1 0 \n", "1 1 1 0 1 1 0 \n", "2 0 0 0 0 0 0 \n", "3 1 1 1 1 1 1 \n", "4 0 1 0 0 1 1 \n", ".. ... ... ... ... ... ... \n", "889 1 1 1 1 1 1 \n", "890 1 1 1 1 1 1 \n", "891 1 1 1 1 1 1 \n", "892 1 1 1 1 1 1 \n", "893 1 0 1 1 0 1 \n", "\n", " struct-164 struct-165 struct-166 \n", "0 0 1 0 \n", "1 1 1 0 \n", "2 0 0 0 \n", "3 1 1 0 \n", "4 1 1 0 \n", ".. ... ... ... 
\n",
        "889 1 1 0 \n",
        "890 1 1 0 \n",
        "891 1 1 0 \n",
        "892 1 1 0 \n",
        "893 1 1 0 \n",
        "\n",
        "[894 rows x 167 columns]"
       ]
      },
      "execution_count": 142,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "soil_maccs_descriptor"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Prepare the mixed descriptor tables: pairwise and full combinations of the\n",
     "# padel, maccs and rule descriptor tables.\n",
     "def merge_descriptor(df1, df2):\n",
     "    \"\"\"Left-join two descriptor tables on the `reduced_smiles` column.\n",
     "\n",
     "    Every row of `df1` is kept and the columns of `df2` are attached where\n",
     "    `reduced_smiles` matches. The shape of the joined frame is printed as a\n",
     "    quick sanity check, then the joined frame is returned.\n",
     "    \"\"\"\n",
     "    df_new = pd.merge(df1, df2, how='left', on='reduced_smiles')\n",
     "    print(df_new.shape)\n",
     "    return df_new\n",
     "\n",
     "\n",
     "# Exact duplicate rows are dropped before each table is written; the shape\n",
     "# printed by merge_descriptor is therefore the shape *before* de-duplication.\n",
     "soil_padel_maccs_descriptor = merge_descriptor(soil_padel_descriptor, soil_maccs_descriptor).drop_duplicates()\n",
     "soil_padel_maccs_descriptor.to_csv(output_path_des+\"soil_padel_maccs_descriptor.tsv\", sep='\\t', index=False)\n",
     "\n",
     "soil_padel_rule_descriptor = merge_descriptor(soil_padel_descriptor, soil_rule_descriptor).drop_duplicates()\n",
     "soil_padel_rule_descriptor.to_csv(output_path_des+\"soil_padel_rule_descriptor.tsv\", sep='\\t', index=False)\n",
     "\n",
     "soil_maccs_rule_descriptor = merge_descriptor(soil_maccs_descriptor, soil_rule_descriptor).drop_duplicates()\n",
     "soil_maccs_rule_descriptor.to_csv(output_path_des+\"soil_maccs_rule_descriptor.tsv\", sep='\\t', index=False)\n",
     "\n",
     "# The full table starts from the already de-duplicated padel+maccs table.\n",
     "soil_all_descriptor = merge_descriptor(soil_padel_maccs_descriptor, soil_rule_descriptor).drop_duplicates()\n",
     "soil_all_descriptor.to_csv(output_path_des+\"soil_all_descriptor.tsv\", sep='\\t', index=False)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 130,
    "metadata": {},
    "outputs": [],
    "source": [
     "def non_nan_mean(x):\n",
     "    \"\"\"Return the mean of the non-NaN values of `x`, or None when `x` is empty.\"\"\"\n",
     "    if x.empty:\n",
     "        return None\n",
     "    return np.mean(x.dropna())\n",
     "\n",
     "\n",
     "def get_subset_target_variable(df_target_Y, des_X, package_label):\n",
     "    \"\"\"Join the per-compound target value with a descriptor table.\n",
     "\n",
     "    df_target_Y   : target frame (df_soil or df_sludge); needs the columns\n",
     "                    `reduced_smiles` and `hl_log_bayesian_mean`.\n",
     "    des_X         : descriptor frame with a `reduced_smiles` column; should come\n",
     "                    from the same package as `df_target_Y`.\n",
     "    package_label : value written to the `package` column (sludge=0, soil=1).\n",
     "\n",
     "    Returns one frame holding `reduced_smiles`, the NaN-ignoring mean of\n",
     "    `hl_log_bayesian_mean` per compound, `package`, and the columns of `des_X`.\n",
     "    The join is a left join, so compounds without a descriptor row are kept\n",
     "    with NaN descriptor values.\n",
     "\n",
     "    NOTE(review): if `des_X` holds several rows for one `reduced_smiles`, the\n",
     "    left join repeats that compound in the result - confirm `des_X` is unique\n",
     "    per compound before using the table for modelling.\n",
     "    \"\"\"\n",
     "    target_Y_ = df_target_Y.copy()  # work on a copy so the caller's frame is untouched\n",
     "    target_Y_.loc[:, 'package'] = package_label\n",
     "    aggs = [non_nan_mean]\n",
     "    df_subset_XY = target_Y_[[\"reduced_smiles\", \"hl_log_bayesian_mean\", \"package\"]]  # \"temperature\", \"log_hl_combined\", \"log_hl_biomass_corrected\"\n",
     "    XY = df_subset_XY.groupby([\"reduced_smiles\"]).agg(aggs).reset_index()\n",
     "    # agg() with a list produces two-level column labels; keep only the column-name level.\n",
     "    XY.columns = XY.columns.droplevel(1)\n",
     "    XY_merge_des = pd.merge(XY, des_X, how='left', on='reduced_smiles')\n",
     "    # XY_merge_des.dropna(axis=0, inplace=True)\n",
     "    # XY_merge_des.drop(columns=['index'], inplace=True)\n",
     "    return XY_merge_des\n",
     "\n",
     "\n",
     "soil_XY_merge_padel = get_subset_target_variable(df_soil_, soil_padel_descriptor, 1)\n",
     "soil_XY_merge_maccs = get_subset_target_variable(df_soil_, soil_maccs_descriptor, 1)\n",
     "soil_XY_merge_rule = get_subset_target_variable(df_soil_, soil_rule_descriptor, 1)\n",
     "soil_XY_merge_padel_maccs = get_subset_target_variable(df_soil_, soil_padel_maccs_descriptor, 1)\n",
     "soil_XY_merge_padel_rule = get_subset_target_variable(df_soil_, soil_padel_rule_descriptor, 1)\n",
     "soil_XY_merge_maccs_rule = get_subset_target_variable(df_soil_, soil_maccs_rule_descriptor, 1)\n",
     "soil_XY_merge_all = get_subset_target_variable(df_soil_, soil_all_descriptor, 1)\n",
     "# X_sludge_des, X_validate, y_sludge_des, y_validate = train_test_split(sludge_XY_merge_des.iloc[:, 2:], sludge_XY_merge_des['hl_log_bayesian_mean'], test_size=0.10, random_state=62)\n",
     "\n",
     "# Write each table to <output_path_des><table name>.tsv\n",
     "soil_XY_tables = {\n",
     "    \"soil_XY_merge_padel\": soil_XY_merge_padel,\n",
     "    \"soil_XY_merge_maccs\": soil_XY_merge_maccs,\n",
     "    \"soil_XY_merge_rule\": soil_XY_merge_rule,\n",
     "    \"soil_XY_merge_padel_maccs\": soil_XY_merge_padel_maccs,\n",
     "    \"soil_XY_merge_padel_rule\": soil_XY_merge_padel_rule,\n",
     "    \"soil_XY_merge_maccs_rule\": soil_XY_merge_maccs_rule,\n",
     "    \"soil_XY_merge_all\": soil_XY_merge_all,\n",
     "}\n",
     "for table_name, table in soil_XY_tables.items():\n",
     "    table.to_csv(output_path_des+table_name+\".tsv\", sep='\\t', index=False)\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 211,
    "metadata": {},
    "outputs": [],
    "source": [
     "sludge_padel_descriptor = pd.read_csv(output_path_des+\"sludge_padel_descriptor.tsv\", sep='\\t')\n",
     "sludge_maccs_descriptor = pd.read_csv(output_path_des+\"sludge_maccs_descriptor.tsv\", sep='\\t')\n",
     "sludge_rule_descriptor = pd.read_csv(output_path_des+\"sludge_rule_descriptor.tsv\", sep='\\t')\n",
     "\n",
     "sludge_padel_maccs_descriptor = pd.read_csv(output_path_des+\"sludge_padel_maccs_descriptor.tsv\", sep='\\t')\n",
     "sludge_padel_rule_descriptor = pd.read_csv(output_path_des+\"sludge_padel_rule_descriptor.tsv\", sep='\\t')\n",
     "sludge_maccs_rule_descriptor = pd.read_csv(output_path_des+\"sludge_maccs_rule_descriptor.tsv\", sep='\\t')\n",
     "\n",
     "sludge_all_descriptor = pd.read_csv(output_path_des+\"sludge_all_descriptor.tsv\", sep='\\t')\n",
     "# sludge_all_descriptor = pd.read_csv(output_path+\"sludge_all_descriptor.tsv\", sep='\\t')\n",
     "# sludge_maccs_rule_descriptor.drop(columns='struct-0', inplace=True)\n",
     "# Keep only the first 167 columns of the sludge MACCS table (the soil MACCS table\n",
     "# displayed earlier in this notebook has 167 columns: index + struct-1..struct-166).\n",
     "# An explicit positional slice replaces drop(columns=<DataFrame>), which only worked\n",
     "# because iterating a DataFrame yields its column labels.\n",
     "sludge_maccs_descriptor = sludge_maccs_descriptor.iloc[:, :167].copy()\n",
     "# sludge_maccs_descriptor.to_csv(output_path_des+\"sludge_maccs_descriptor.tsv\", sep='\\t', index=False)\n",
     "\n",
     "# NOTE(review): the commented-out column clean-ups below look like one-off fixes of the\n",
     "# saved descriptor files - confirm before deleting them.\n",
     "# sludge_padel_maccs_descriptor.drop(columns='struct-0', inplace=True)\n",
     "# sludge_padel_maccs_descriptor.drop(columns=sludge_padel_maccs_descriptor.iloc[:, 1614:], inplace=True)\n",
     "# sludge_padel_maccs_descriptor.to_csv(output_path_des+\"sludge_padel_maccs_descriptor.tsv\", sep='\\t', index=False)\n",
     "\n",
     "# sludge_maccs_rule_descriptor.drop(columns='struct-0', inplace=True)\n",
     "# sludge_maccs_rule_descriptor.drop(columns=sludge_maccs_rule_descriptor.iloc[:, 167:193], inplace=True)\n",
     "# sludge_maccs_rule_descriptor.to_csv(output_path_des+\"sludge_maccs_rule_descriptor.tsv\", sep='\\t', index=False)\n",
     "\n",
     "# sludge_all_descriptor.drop(columns='struct-0', inplace=True)\n",
     "# sludge_all_descriptor.drop(columns=sludge_all_descriptor.iloc[:, 1611:1636], inplace=True)\n",
     "# sludge_all_descriptor.to_csv(output_path_des+\"sludge_all_descriptor.tsv\", sep='\\t', index=False)\n",
     "\n",
     "sludge_XY_merge_padel = get_subset_target_variable(df_sludge_, sludge_padel_descriptor, 0)\n",
     "sludge_XY_merge_maccs = get_subset_target_variable(df_sludge_, sludge_maccs_descriptor, 0)\n",
     "sludge_XY_merge_rule = get_subset_target_variable(df_sludge_, sludge_rule_descriptor, 0)\n",
     "\n",
     "sludge_XY_merge_padel_maccs = get_subset_target_variable(df_sludge_, sludge_padel_maccs_descriptor, 0)\n",
     "sludge_XY_merge_padel_rule = get_subset_target_variable(df_sludge_, sludge_padel_rule_descriptor, 0)\n",
     "sludge_XY_merge_maccs_rule = get_subset_target_variable(df_sludge_, sludge_maccs_rule_descriptor, 0)\n",
     "\n",
     "sludge_XY_merge_all = get_subset_target_variable(df_sludge_, sludge_all_descriptor, 0)\n",
     "\n",
     "# Write each table to <output_path_des><table name>.tsv\n",
     "sludge_XY_tables = {\n",
     "    \"sludge_XY_merge_padel\": sludge_XY_merge_padel,\n",
     "    \"sludge_XY_merge_maccs\": sludge_XY_merge_maccs,\n",
     "    \"sludge_XY_merge_rule\": sludge_XY_merge_rule,\n",
     "    \"sludge_XY_merge_padel_maccs\": sludge_XY_merge_padel_maccs,\n",
     "    \"sludge_XY_merge_padel_rule\": sludge_XY_merge_padel_rule,\n",
     "    \"sludge_XY_merge_maccs_rule\": sludge_XY_merge_maccs_rule,\n",
     "    \"sludge_XY_merge_all\": sludge_XY_merge_all,\n",
     "}\n",
     "for table_name, table in sludge_XY_tables.items():\n",
     "    table.to_csv(output_path_des+table_name+\".tsv\", sep='\\t', index=False)\n"
    ]
   },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 214,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read the vec descriptor tables (tab-separated) for both packages and pass each one,\n",
+ "# together with its package dataframe, through get_subset_target_variable.\n",
+ "# NOTE(review): the trailing 0/1 presumably mirrors the 'package' code (0: sludge, 1: soil) - confirm against the helper.\n",
+ "sludge_vec_descriptor = pd.read_csv(output_path_des+\"sludge_vec_descriptor.tsv\", sep='\\t')\n",
+ "soil_vec_descriptor = pd.read_csv(output_path_des+\"soil_vec_descriptor.tsv\", sep='\\t')\n",
+ "sludge_XY_merge_vec = get_subset_target_variable(df_sludge_, sludge_vec_descriptor, 0)\n",
+ "soil_XY_merge_vec = get_subset_target_variable(df_soil_, soil_vec_descriptor, 1)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 217,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Write the merged vec tables as TSV into output_path_des, without the index column.\n",
+ "# NOTE(review): 'encoded_vectors' is displayed below with dtype object, so each vector is presumably stored as text\n",
+ "# and would need parsing before numeric use after a reload - confirm.\n",
+ "sludge_XY_merge_vec.to_csv(output_path_des+\"sludge_XY_merge_vec.tsv\", sep='\\t', index=False)\n",
+ "soil_XY_merge_vec.to_csv(output_path_des+\"soil_XY_merge_vec.tsv\", sep='\\t', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 222,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 [[ 3.75860065e-01 -5.16975820e-01 3.04236203e...\n",
+ "1 [[ 0.4711976 -0.55014855 0.16992049 0.34950...\n",
+ "2 [[ 0.36134678 -0.18154728 0.02233491 -0.01184...\n",
+ "3 [[-0.18096934 0.02644509 -0.239617 0.20880...\n",
+ "4 [[ 9.00694355e-02 -9.96635780e-02 -2.27172539e...\n",
+ " ... \n",
+ "155 [[ 0.33386835 -0.2263216 -0.19296879 0.46654...\n",
+ "156 [[ 3.01284581e-01 2.92636994e-02 -1.98752340e...\n",
+ "157 [[ 6.13241017e-01 -2.64540642e-01 -2.28865877e...\n",
+ "158 [[-2.25752279e-01 -7.45686233e-01 2.86516100e...\n",
+ "159 [[ 7.78277069e-02 5.80310643e-01 -3.66891086e...\n",
+ "Name: encoded_vectors, Length: 160, dtype: object"
+ ]
+ },
+ "execution_count": 222,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Inspect the vector column by name instead of by position, so a change in column order cannot show a different column.\n",
+ "sludge_XY_merge_vec[\"encoded_vectors\"]"
+ ]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.13 ('envipath')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "2347b11aabc7676a0034671a85ea3c0a49e970a2c62f233a06be25b5abf82f7b"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}