diff --git a/code/scripts/generate_plots.py b/code/scripts/generate_plots.py index 321e44d..4b16f82 100644 --- a/code/scripts/generate_plots.py +++ b/code/scripts/generate_plots.py @@ -1,30 +1,30 @@ import os import matplotlib.pyplot as plt import pandas as pd import plots.plot_categories as plot_categories import plots.plot_fossil_fuel_industry as plot_fossil_fuel_industry import plots.plot_government as plot_government import plots.plot_experience as plot_experience import plots.plot_missing_participants as plot_missing_participants import plots.plot_gender_rate as plot_gender_rate import plots.plot_delegation_sizes as plot_delegation_sizes import plots.plot_overall_experience_distr as plot_overall_experience_distr import plots.plot_delegation_exp as plot_delegation_exp import plots.plot_intervention_distr as plot_intervention_distr import plots.plot_detailed_experience as plot_detailed_experience import plots.plot_participant_graph as plot_participant_graph +plot_intervention_distr.plot("../data/data_regression/dataset_interventions.csv") plot_participant_graph.plot("../results/experience_dict_def.txt") plot_detailed_experience.plot("../results/complete_dataset_experience-def.csv") plot_experience.plot("../results/complete_dataset_experience-def.csv") plot_fossil_fuel_industry.plot("../results/complete_dataset.csv") plot_government.plot("../results/complete_dataset.csv") plot_gender_rate.plot("../results/complete_dataset.csv") plot_categories.plot("../results/participants-csv/participants_") -plot_intervention_distr.plot("../data/data_regression/dataset_interventions.csv") plot_missing_participants.plot("../results/participants-csv/participants_cop") plot_delegation_exp.plot("../results/complete_dataset_experience-def.csv") plot_overall_experience_distr.plot("../results/complete_dataset_experience-2.csv") plot_delegation_sizes.plot("../results/complete_dataset.csv") diff --git a/code/scripts/plots/plot_intervention_distr.py b/code/scripts/plots/plot_intervention_distr.py index 5630db1..56abe5f 100644 --- a/code/scripts/plots/plot_intervention_distr.py +++ b/code/scripts/plots/plot_intervention_distr.py @@ -1,32 +1,36 @@ import pandas as pd import matplotlib.pyplot as plt import math def plot(path): """ Plots the distribution of interventions per party per meeting. Args: path (str): path to the interventions_prepared.csv file """ data = pd.read_csv(path, encoding="utf-8-sig") - plt.hist((data["nb_interventions"].loc[data["nb_interventions"] > 0]).map(lambda n: math.log(n))) + plt.hist(data["nb_interventions"], bins=40) + # plt.hist((data["nb_interventions"].loc[data["nb_interventions"] > 0]).map(lambda n: math.log(n+10)).map(lambda n: math.log(n+10)), bins=40) + plt.title("Distribution of interventions") + plt.ylabel("Occurrences") + plt.xlabel("Nb. 
of interventions") plt.show() # data["meeting_and_country"] = str(data["meeting"]) + str(data["country"]) """by_nb_interventions = data.groupby("nb_interventions") result_df = pd.DataFrame(columns={"nb_interventions", "nb_affiliations"}) for n, aff in by_nb_interventions: result_df = result_df.append({ "nb_interventions": n, "nb_affiliations": len(aff) }, ignore_index=True) plot_data = result_df.set_index("nb_interventions") plot_data.plot(kind="hist", xlabel="nb of interventions", ylabel="number of affiliations", title="Distribution of number of interventions per party and meeting", xlim=[-1000,1000]) plt.show() """ \ No newline at end of file diff --git a/code/scripts/predict_interventions.ipynb b/code/scripts/predict_interventions.ipynb index a2215ab..f52604e 100644 --- a/code/scripts/predict_interventions.ipynb +++ b/code/scripts/predict_interventions.ipynb @@ -1,1048 +1,977 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Predictive modelling of interventions\n", "## Ridge regression\n", "\n", "This notebook is ................\n", "\n", "\n", "First of all, we import the necessary packets." ] }, { "cell_type": "code", - "execution_count": 173, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn import linear_model\n", "from sklearn import model_selection\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn.metrics import accuracy_score\n", "\n", "# constants\n", "FIRST_COUNTRY_INDEX_WO_EXP = 12\n", "FIRST_COUNTRY_INDEX = FIRST_COUNTRY_INDEX_WO_EXP + 3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Prepare the dataset\n", "The data is provided in pandas dataframes. If the necessary csv files are not available, they can be generated with the script 'prepare_intervention_data.py'. This data now needs to be converted into numpy array such that we can train our model." ] }, { "cell_type": "code", - "execution_count": 174, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9217\n", - "[0. 7. 1. 0.85714286 0.14285714 0.\n", - " 0. 0. 0. 0. 0. 0.57142857\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 0. 0.\n", - " 0. 0. 0. 0. 
]\n", "['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo Republic', 'Cook Islands', 'Costa Rica', \"Cote d'Ivoire\", 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'DR Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', 'Kuwait', 'Kyrgyz Republic', 'Laos', 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macedonia', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Mauritania', 'Mauritius', 'Mexico', 'Micronesia, Fed. Sts.', 'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Niue', 'North Korea', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Palestine', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'Rwanda', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Korea', 'South Sudan', 'Spain', 'Sri Lanka', 'St. Kitts and Nevis', 'St. Lucia', 'St. 
Vincent and the Grenadines', 'Sudan', 'Suriname', 'Sweden', 'Switzerland', 'Syria', 'Tajikistan', 'Tanzania', 'Thailand', 'Timor-Leste', 'Togo', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Vatican', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe']\n" ] } ], "source": [ "data = pd.read_csv(\"../data/data_regression/dataset_interventions.csv\",\n", " encoding=\"utf-8-sig\")\n", "\n", - "D = 214\n", + "D = 213\n", + "D_wo_exp = D - 3\n", "N = len(data)\n", "print(N)\n", "dataset = np.zeros((N, D), dtype=np.float64)\n", - "dataset_without_exp = np.zeros((N, D), dtype=np.float64)\n", + "dataset_without_exp = np.zeros((N, D_wo_exp), dtype=np.float64)\n", "\n", "dataset_without_exp[:,:FIRST_COUNTRY_INDEX_WO_EXP] = (data.loc[:,\"year\":\"woman_proportion\"]).to_numpy()\n", "dataset[:,:FIRST_COUNTRY_INDEX] = (data.loc[:,\"year\":\"experience score parties rate\"]).to_numpy()\n", "\n", "labelset = np.zeros((N,), dtype=np.float64)\n", "labelset[:] = (data.loc[:, \"nb_interventions\"]).to_numpy()\n", "\n", "# read the valid countries\n", "country_file = open(\"../data/dictionaries/valid_countries.txt\", \"r\")\n", "countries = country_file.readlines()\n", "countries = [c.replace(\"\\n\", \"\") for c in countries]\n", "countries = sorted(countries)\n", "print(countries)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The remaining part of the dataset is the country. We need to write a function that returns the index of the country in a sorted list of the valid countries." ] }, { "cell_type": "code", - "execution_count": 198, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def get_country_index(country):\n", " if country in countries:\n", " return countries.index(country)\n", " else:\n", " # unknown country\n", " return len(countries)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To finalize the dataset, we use the defined function. For indices 12 to 209, a 1 means that the affiliation is this country and a 0 means that it isn't." ] }, { "cell_type": "code", - "execution_count": 199, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "for i in range(N):\n", " country = data.iloc[i,1]\n", " dataset_without_exp[i, FIRST_COUNTRY_INDEX_WO_EXP + get_country_index(country)] = 1\n", " dataset[i, FIRST_COUNTRY_INDEX + get_country_index(country)] = 1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Split the data into training data and test data\n", "In a first step, we consider the first 80% of the samples as training data and the remaining 20% as test data." ] }, { "cell_type": "code", - "execution_count": 177, + "execution_count": 13, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[23. 10. 0. ... 0. 0. 0.]\n", - " [15. 2. 1. ... 0. 0. 0.]\n", - " [14. 13. 0. ... 0. 0. 0.]\n", - " ...\n", - " [21. 11. 1. ... 0. 0. 0.]\n", - " [18. 10. 0. ... 0. 0. 0.]\n", - " [24. 6. 1. ... 0. 0. 0.]]\n", - "[[ 8. 3. 0. ... 0. 0. 0.]\n", - " [12. 3. 0. ... 0. 1. 0.]\n", - " [20. 10. 0. ... 0. 0. 0.]\n", - " ...\n", - " [ 6. 46. 0. ... 0. 0. 0.]\n", - " [20. 11. 0. ... 0. 0. 0.]\n", - " [24. 66. 0. ... 0. 0. 
0.]]\n" - ] - } - ], + "outputs": [], "source": [ "# shuffle everything\n", "np.random.seed(2020)\n", "p = np.random.permutation(N)\n", "shuffled_dataset_wo_exp = dataset_without_exp[p]\n", "shuffled_dataset = dataset[p]\n", "shuffled_labelset = labelset[p]\n", "\n", "SPLIT_IDX = 7400\n", "# seperate train and test data\n", "X_train_without_exp = shuffled_dataset_wo_exp[:SPLIT_IDX]\n", "X_train = shuffled_dataset[:SPLIT_IDX]\n", "Y_train = shuffled_labelset[:SPLIT_IDX]\n", "\n", "X_test_without_exp = shuffled_dataset_wo_exp[SPLIT_IDX:]\n", "X_test = shuffled_dataset[SPLIT_IDX:]\n", - "Y_test = shuffled_labelset[SPLIT_IDX:]\n", - "\n", - "print(X_train)\n", - "print(X_test)" + "Y_test = shuffled_labelset[SPLIT_IDX:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Baseline models\n", "To have a upper bound for the performance of our models, we introduce two baseline models. The first one consists in predicting always 0 interventions." ] }, { "cell_type": "code", - "execution_count": 178, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean squared error: 91.79\n" ] } ], "source": [ "n = Y_test.shape\n", "test_predict_baseline_zero = np.zeros(n)\n", "baseline_zero_mse = mean_squared_error(Y_test, test_predict_baseline_zero)\n", "print('Mean squared error: %.2f'\n", " % baseline_zero_mse)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another baseline is to predict for each country the average number of interventions done in the training data meetings." ] }, { "cell_type": "code", - "execution_count": 179, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Mean squared error: 29.34\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":10: RuntimeWarning: invalid value encountered in double_scalars\n", - " avg = np.sum(train_samples_this_country) / len(train_samples_this_country)\n" + "Mean squared error: 29.33\n" ] } ], "source": [ "test_predict_baseline_avg = np.zeros(n)\n", "\n", "# fill in a list with the averages\n", "intervention_averages = []\n", " \n", "# predict accordingly\n", "for i in range(len(countries)):\n", " index = FIRST_COUNTRY_INDEX + i\n", - " train_samples_this_country = (Y_train[X_train_without_exp[:,index] == 1])\n", + " train_samples_this_country = (Y_train[X_train[:,index] == 1])\n", " avg = np.sum(train_samples_this_country) / len(train_samples_this_country)\n", " intervention_averages.append(avg)\n", " \n", - " test_predict_baseline_avg[X_test_without_exp[:,index] == 1] = avg\n", + " test_predict_baseline_avg[X_test[:,index] == 1] = avg\n", " \n", "baseline_avg_mse = mean_squared_error(Y_test, test_predict_baseline_avg)\n", "print('Mean squared error: %.2f'\n", " % baseline_avg_mse)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As expected, this result is equal to a simple linear model without global bias that only works on the country data." 
] }, { "cell_type": "code", - "execution_count": 180, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Mean squared error: 29.34\n" + "Mean squared error: 29.33\n" ] } ], "source": [ "# baseline data\n", - "X_train_baseline = X_train_without_exp[:,FIRST_COUNTRY_INDEX:]\n", - "X_test_baseline = X_test_without_exp[:,FIRST_COUNTRY_INDEX:]\n", + "X_train_baseline = X_train[:,FIRST_COUNTRY_INDEX:]\n", + "X_test_baseline = X_test[:,FIRST_COUNTRY_INDEX:]\n", "\n", "reg = linear_model.LinearRegression(fit_intercept=False)\n", "reg.fit(X_train_baseline, Y_train)\n", "test_predict = reg.predict(X_test_baseline)\n", "intervention_mse = mean_squared_error(Y_test, test_predict)\n", "print('Mean squared error: %.2f'\n", " % intervention_mse)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Ridge regression on all the data without experience\n", "Now we can train the first actual model: conventional ridge regression, which assumes a Gaussian distribution. We first use cross-validation to determine the best regularizer." ] }, { "cell_type": "code", - "execution_count": 181, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The best regularizer is lambda = 0.419336104314631\n" + "The best regularizer is lambda = 0.41842885079015846\n" ] } ], "source": [ "# cross validation to determine regularizer\n", - "reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 10000))\n", + "reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 1000))\n", "reg.fit(X_train_without_exp, Y_train)\n", "lambda_ = reg.alpha_\n", "print(f\"The best regularizer is lambda = {lambda_}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally, we train the model with the optimal lambda."
] }, { "cell_type": "code", - "execution_count": 182, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[-7.13665522e-02 6.59049181e-03 -3.82415856e-01 -2.49200794e-01\n", - " -5.96202313e-01 -3.34337880e-01 1.39240071e+00 6.48203795e-01\n", - " -6.05408304e-01 -2.55455214e-01 -2.58029214e-03 3.96120613e-02\n", - " -1.21371525e+00 -1.81763955e+00 -4.71823475e-01 -1.19928000e+00\n", - " -1.65658355e+00 -1.44486669e+00 2.72852747e+00 -1.69567035e+00\n", - " 1.64520286e+01 -1.58899348e+00 -1.72531274e+00 -1.62792100e+00\n", - " -1.53839254e+00 -3.43152262e-02 -9.00773667e-01 -4.59189090e-01\n", - " -1.77276678e+00 -1.48538750e+00 -1.34297520e+00 -1.35344568e+00\n", - " 3.70904223e+00 -1.46320692e+00 -1.40881902e+00 8.22983991e+00\n", - " -1.35415175e+00 -1.38506583e+00 -1.11904218e+00 -1.53537790e+00\n", - " -1.62573645e+00 -1.65085517e+00 -1.60246507e+00 1.30237108e+01\n", - " -1.31646533e+00 -1.65961309e+00 -3.17487498e-01 4.56611846e+01\n", - " 1.62308631e+00 -1.89720477e+00 -1.89944629e+00 -1.51664452e+00\n", - " -5.98833897e-01 -1.99760291e+00 -1.22638417e+00 -1.30883326e+00\n", - " -1.54623568e+00 -1.63871639e+00 -1.91491266e+00 -1.62548996e+00\n", - " -1.74149657e+00 -1.66836302e+00 -1.39977936e+00 -7.75753622e-01\n", - " 3.71729826e-01 -1.18154897e+00 -1.45133043e+00 -1.82431703e+00\n", - " -1.83756277e+00 -1.63914199e+00 -1.50201217e+00 -1.51266739e+00\n", - " -1.63593244e+00 -1.04564981e+00 -1.38367796e+00 -7.80577746e-01\n", - " -1.62871987e+00 -1.05452013e+00 -4.81506910e-01 -1.81112694e+00\n", - " -5.04965074e-01 -1.17296295e+00 -1.88750128e+00 -1.79040036e+00\n", - " -1.52185828e+00 -1.60444087e+00 -1.40588319e+00 -1.06748239e+00\n", - " -5.28393774e-01 7.49258387e+00 6.30282062e-01 1.04425604e+00\n", - " -1.23538695e+00 -1.71901578e+00 -1.72585761e+00 -1.53174640e+00\n", - " -1.40177938e+00 1.52850627e+01 -1.50455585e+00 -8.54675399e-01\n", - " -7.26237676e-01 -1.44013805e+00 6.44439599e-01 -1.44783051e+00\n", - " -1.80899972e+00 -1.73037459e+00 -1.55653188e+00 -1.83419082e+00\n", - " -1.58250059e+00 -1.38615814e+00 -1.57010087e+00 -1.66652533e+00\n", - " -1.62857853e+00 -1.55513571e+00 -1.64774881e+00 -1.53945506e+00\n", - " 8.31808039e-01 -1.38658389e+00 -1.43398938e+00 -1.64096458e+00\n", - " 9.43828979e-01 -1.17458417e+00 -1.06147845e+00 1.09377121e+00\n", - " -3.50140199e-01 -1.74815485e+00 -1.74717215e+00 -1.82251503e+00\n", - " -1.45717243e+00 -1.72302007e+00 -1.66048684e+00 -1.71442873e+00\n", - " -1.40458916e+00 -1.44130242e+00 -1.24446076e+00 -1.02825189e+00\n", - " 7.82144721e+00 -6.88467863e-01 -1.89769936e+00 1.10854846e-01\n", - " -1.69374660e+00 -1.67795739e+00 7.84542221e+00 -1.12068270e+00\n", - " -1.46719093e-01 -1.55211538e+00 -9.39822493e-01 -1.15998707e+00\n", - " 9.00872296e-01 -1.41617035e+00 -1.87602392e-01 2.96118894e+00\n", - " -8.19396382e-01 -1.82534792e+00 -1.03493027e+00 -1.54117375e+00\n", - " 5.21959225e+00 -1.61701487e+00 9.64523625e-02 -1.85674512e+00\n", - " -1.69482740e+00 1.32121200e+01 -7.17668833e-01 -1.51180882e+00\n", - " -1.66651065e+00 -1.36264943e+00 -3.65314411e-01 -1.72872864e+00\n", - " -1.28398657e+00 -1.53888132e+00 -1.37452187e+00 5.01054002e+00\n", - " 5.05474239e-01 -1.12217817e+00 -1.21344928e+00 -1.52424357e+00\n", - " -1.63409783e+00 -1.43098662e+00 -1.68290784e+00 -1.15364857e+00\n", - " -1.61478000e+00 -1.50666805e+00 8.64771919e+00 -1.45828066e+00\n", - " -1.39802297e+00 -2.32850512e-01 -7.15311709e-01 -1.07000150e+00\n", - " -1.76413141e+00 -1.65429239e+00 
-1.12322938e+00 -1.72944664e+00\n", - " -7.59991408e-01 -1.77649064e+00 3.90523625e+00 -3.46630360e-01\n", - " -8.46931399e-01 -1.08876322e+00 -2.05607745e+00 5.30957789e+01\n", - " -3.93589828e-01 -1.69549627e+00 -1.77042953e+00 -1.88665146e+00\n", - " 1.32445023e+00 -1.55658726e+00 -1.68391871e+00 -1.15687231e+00\n", - " -7.88761082e-01 -2.25610190e+00 0.00000000e+00 0.00000000e+00\n", - " 0.00000000e+00 0.00000000e+00]\n" + "[-7.13642429e-02 6.58946776e-03 -3.82433989e-01 -2.49429907e-01\n", + " -5.96414687e-01 -3.34486906e-01 1.39375847e+00 6.47823435e-01\n", + " -6.05626958e-01 -2.55623447e-01 -2.60952679e-03 3.95918137e-02\n", + " -1.21377146e+00 -1.81767521e+00 -4.71824154e-01 -1.19961948e+00\n", + " -1.65663683e+00 -1.44488124e+00 2.72858263e+00 -1.69571753e+00\n", + " 1.64524129e+01 -1.58900153e+00 -1.72537838e+00 -1.62797160e+00\n", + " -1.53844956e+00 -3.42891263e-02 -9.00781289e-01 -4.59202070e-01\n", + " -1.77278387e+00 -1.48543371e+00 -1.34299708e+00 -1.35349655e+00\n", + " 3.70911272e+00 -1.46326714e+00 -1.40883620e+00 8.23009830e+00\n", + " -1.35422893e+00 -1.38508140e+00 -1.11907158e+00 -1.53542728e+00\n", + " -1.62580022e+00 -1.65087519e+00 -1.60252097e+00 1.30240996e+01\n", + " -1.31650172e+00 -1.65965215e+00 -3.17488981e-01 4.56621993e+01\n", + " 1.62312763e+00 -1.89724934e+00 -1.89948799e+00 -1.51666065e+00\n", + " -5.98840741e-01 -1.99762987e+00 -1.22642821e+00 -1.30885382e+00\n", + " -1.54629655e+00 -1.63875425e+00 -1.91495061e+00 -1.62549591e+00\n", + " -1.74153932e+00 -1.66840866e+00 -1.39982960e+00 -7.75761703e-01\n", + " 3.71759205e-01 -1.18158657e+00 -1.45139592e+00 -1.82436479e+00\n", + " -1.83757737e+00 -1.63917996e+00 -1.50203893e+00 -1.51269781e+00\n", + " -1.63591116e+00 -1.04561859e+00 -1.38372966e+00 -7.80581567e-01\n", + " -1.62876818e+00 -1.05448387e+00 -4.81512312e-01 -1.81114302e+00\n", + " -5.04974347e-01 -1.17298904e+00 -1.88757119e+00 -1.79047334e+00\n", + " -1.52190023e+00 -1.60448353e+00 -1.40594681e+00 -1.06750718e+00\n", + " -5.28399045e-01 7.49276801e+00 6.30359904e-01 1.04433777e+00\n", + " -1.23543153e+00 -1.71903051e+00 -1.72589773e+00 -1.53173339e+00\n", + " -1.40181466e+00 1.52854831e+01 -1.50459493e+00 -8.54686597e-01\n", + " -7.26223162e-01 -1.44016893e+00 6.44598330e-01 -1.44790264e+00\n", + " -1.80902728e+00 -1.73040640e+00 -1.55656881e+00 -1.83424344e+00\n", + " -1.58254783e+00 -1.38620048e+00 -1.57015830e+00 -1.66656551e+00\n", + " -1.62861822e+00 -1.55519089e+00 -1.64780335e+00 -1.53947034e+00\n", + " 8.31876980e-01 -1.38660348e+00 -1.43404338e+00 -1.64099921e+00\n", + " 9.43863715e-01 -1.17462779e+00 -1.06151389e+00 1.09378073e+00\n", + " -3.50150316e-01 -1.74820156e+00 -1.74724953e+00 -1.82254524e+00\n", + " -1.45728350e+00 -1.72301180e+00 -1.66051204e+00 -1.71446044e+00\n", + " -1.40461973e+00 -1.44136632e+00 -1.24447239e+00 -1.02824441e+00\n", + " 7.82166535e+00 -6.88488114e-01 -1.89775217e+00 1.10945218e-01\n", + " -1.69381285e+00 -1.67813375e+00 7.84567195e+00 -1.12071394e+00\n", + " -1.46714631e-01 -1.55216505e+00 -9.39937768e-01 -1.16002852e+00\n", + " 9.00907200e-01 -1.41621122e+00 -1.87597216e-01 2.96127915e+00\n", + " -8.19373997e-01 -1.82536960e+00 -1.03491066e+00 -1.54119289e+00\n", + " 5.21971752e+00 -1.61707535e+00 9.64571438e-02 -1.85697260e+00\n", + " -1.69489468e+00 1.32126052e+01 -7.17685890e-01 -1.51187939e+00\n", + " -1.66654440e+00 -1.36268284e+00 -3.65299724e-01 -1.72874729e+00\n", + " -1.28401247e+00 -1.53892082e+00 -1.37462923e+00 5.01072537e+00\n", + " 5.05612917e-01 -1.12228025e+00 -1.21343874e+00 
-1.52425989e+00\n", + " -1.63414787e+00 -1.43100126e+00 -1.68297071e+00 -1.15365286e+00\n", + " -1.61481048e+00 -1.50666642e+00 8.64789477e+00 -1.45831372e+00\n", + " -1.39809723e+00 -2.32839419e-01 -7.15218490e-01 -1.07007246e+00\n", + " -1.76416753e+00 -1.65434178e+00 -1.12325176e+00 -1.72949626e+00\n", + " -7.59980720e-01 -1.77653753e+00 3.90534733e+00 -3.46604501e-01\n", + " -8.46975381e-01 -1.08871290e+00 -2.05608339e+00 5.30970083e+01\n", + " -3.93592744e-01 -1.69549868e+00 -1.77047873e+00 -1.88737188e+00\n", + " 1.32449924e+00 -1.55660831e+00 -1.68399117e+00 -1.15687968e+00\n", + " -7.88748835e-01 -2.25608474e+00]\n" ] } ], "source": [ "reg = linear_model.Ridge(alpha=lambda_)\n", "reg.fit(X_train_without_exp, Y_train)\n", "w0 = reg.intercept_\n", "W = reg.coef_\n", "#print(w0)\n", "print(W)" ] }, { "cell_type": "code", - "execution_count": 183, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[ 0.92528263 -0.15941965 1.64093246 ... 11.14722766 -0.09032359\n", - " 47.2102156 ]\n", "Mean squared error: 29.27\n" ] } ], "source": [ "test_predict = reg.predict(X_test_without_exp)\n", - "print(test_predict)\n", "print('Mean squared error: %.2f'\n", " % mean_squared_error(Y_test, test_predict))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As we notice, the mean square error is basically equal to the one in the baseline model. The additional dimensions do not help to improve the model." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Linear Model with logarithmic transformation\n", "The reason for the failure of the normal linear model is that our Y doesn't follow a Gaussian distribution, but a logarithmic. We thus try the same model but with a transformed Y. We transform the y's accordingly: \n", "$y' = log(c + y)$ with $c > 0$.\n", "We try different values for c. First, c = 1 (which preserves y = 0 to be y' = 0)" ] }, { "cell_type": "code", "execution_count": 200, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "c = 10\n", "Lambda is 0.3447764054734464\n", "[2.36003016 2.29673558 2.41597337 ... 2.98163425 2.30944377 3.87097441]\n", "[ 0.59127089 -0.05832438 1.20066747 ... 9.72001773 0.06882252\n", " 37.98912433]\n", "[2.30258509 2.30258509 2.39789527 ... 2.30258509 2.30258509 3.8501476 ]\n", "[ 0. 0. 1. ... 0. 0. 37.]\n", "Mean squared error : 34.31\n", "c = 1000\n", "Lambda is 0.4070142453219439\n", "[6.90866435 6.90759974 6.90938658 ... 6.91880515 6.90767333 6.9534976 ]\n", "[ 0.9094879 -0.15552457 1.63262819 ... 11.11114723 -0.08194264\n", " 46.80463149]\n", "[6.90775528 6.90775528 6.90875478 ... 6.90775528 6.90775528 6.94408721]\n", "[ 0. 0. 1. ... 0. 0. 37.]\n", "Mean squared error : 29.44\n", "c = 2000\n", "Lambda is 0.4070142453219439\n", "[7.60136086 7.60082379 7.60172055 ... 7.606452 7.60085931 7.62413461]\n", "[ 0.91701708 -0.15734237 1.63685623 ... 11.12993688 -0.08628893\n", " 47.00824055]\n", "[7.60090246 7.60090246 7.60140233 ... 7.60090246 7.60090246 7.61923342]\n", "[ 0. 0. 1. ... 0. 0. 37.]\n", "Mean squared error : 29.35\n", "c = 5000\n", "Lambda is 0.41842885079015846\n", "[8.51737757 8.51716146 8.51752098 ... 8.51941864 8.51717549 8.52657378]\n", "[ 0.92196911 -0.15863347 1.63922619 ... 11.13962064 -0.08850618\n", " 47.12363259]\n", "[8.51719319 8.51719319 8.51739317 ... 8.51719319 8.51719319 8.52456595]\n", "[ 0. 0. 1. ... 0. 0. 
37.]\n", "Mean squared error : 29.30\n", "c = 10000\n", "Lambda is 0.41842885079015846\n", "[9.21043273 9.21032447 9.21050437 ... 9.2114541 9.21033143 9.215046 ]\n", "[ 0.92360649 -0.15901823 1.64008583 ... 11.14348634 -0.08942983\n", " 47.16715676]\n", "[9.21034037 9.21034037 9.21044037 ... 9.21034037 9.21034037 9.21403354]\n", "[ 0. 0. 1. ... 0. 0. 37.]\n", "Mean squared error : 29.29\n", "c = 20000\n", "Lambda is 0.41842885079015846\n", "[9.90353377 9.90347959 9.90356958 ... 9.90404467 9.90348306 9.90584423]\n", "[ 0.92443436 -0.15921203 1.64051656 ... 11.14542617 -0.08989534\n", " 47.18910678]\n", "[9.90348755 9.90348755 9.90353755 ... 9.90348755 9.90348755 9.90533584]\n", "[ 0. 0. 1. ... 0. 0. 37.]\n", "Mean squared error : 29.28\n" ] } ], "source": [ "cs = [10, 1000, 2000, 5000, 10000, 20000]\n", "\n", "for c in cs:\n", " print(f\"c = {c}\")\n", " # logarithmic transformation\n", " Y_train_transf = np.log(c + Y_train)\n", " Y_test_transf = np.log(c + Y_test)\n", "\n", " # crossvalidation for lambda\n", " reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 1000))\n", " reg.fit(X_train_without_exp, Y_train_transf)\n", " lambda_ = reg.alpha_\n", " print(f\"Lambda is {lambda_}\")\n", " test_predict_transf = reg.predict(X_test_without_exp)\n", " # transform the output back\n", " test_predict = np.exp(test_predict_transf) - c\n", " \n", " print(test_predict_transf)\n", " print(test_predict)\n", " print(Y_test_transf)\n", " print(Y_test)\n", "\n", " print('Mean squared error : %.2f'\n", " % mean_squared_error(Y_test, test_predict))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To compare, we do the baseline data (only countries) with the optimal c that we found" ] }, { "cell_type": "code", "execution_count": 185, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean squared error : 44.81\n" ] } ], "source": [ "c = 0.3 # the optimal found above\n", "\n", "# logarithmic transformation\n", "Y_train_transf = np.log(c + Y_train)\n", "Y_test_transf = np.log(c + Y_test)\n", "\n", "# crossvalidation for lambda\n", "reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 1000))\n", "reg.fit(X_train_baseline, Y_train_transf)\n", "lambda_ = reg.alpha_\n", "test_predict_transf = reg.predict(X_test_baseline)\n", "# transform the output back\n", "test_predict = np.exp(test_predict_transf) - c\n", "\n", "print('Mean squared error : %.2f'\n", " % mean_squared_error(Y_test, test_predict))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Combine multiple models\n", "Even with the logistic transformation, our data doesn't follow a Gaussian distribution. One problem is that there are a lot of samples with value 0. Several papers suggest for situations like that to first perform a classifying task that decides whether a sample is 0 or not, and then apply a second model. (e.g. https://www.kent.ac.uk/smsas/personal/msr/webfiles/zip/ibc_fin.pdf) As we work with count data, Poisson distribution may fit our data better.\\\n", "Hence, we first apply logistic regression to classify into 0 and non 0." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Miniconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n", "C:\\ProgramData\\Miniconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n", "C:\\ProgramData\\Miniconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n", "C:\\ProgramData\\Miniconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n", "C:\\ProgramData\\Miniconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n", "C:\\ProgramData\\Miniconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n", "C:\\ProgramData\\Miniconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n", "C:\\ProgramData\\Miniconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n" ] } ], "source": [ "Y_train_class = Y_train > 0\n", "Y_test_class = Y_test > 0\n", "\n", "clf = linear_model.LogisticRegression(max_iter=2000, fit_intercept=True)\n", "\n", "# do crossvalidation\n", "params = {'C': np.logspace(-3, 3, 100)}\n", "cv = model_selection.GridSearchCV(clf, params)\n", "cv.fit(X_train_without_exp, Y_train_class)\n", "print(cv.best_params_)\n", "\n", "predict_class = cv.predict(X_test_without_exp)\n", "print(1 - np.mean(predict_class))\n", "print(1 - np.mean(Y_test_class))\n", "accuracy = accuracy_score(Y_test_class, predict_class)\n", "print(f\"accuracy = {accuracy}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(Note: only 78.3% accuracy on training data.)\\\n", "Now, on the data that is not zero, we apply a Poission regressor." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\"\"\"X_train_poiss = X_train[clf.predict(X_train) == 1]\n", "Y_train_poiss = Y_train[clf.predict(X_train) == 1]\n", "X_test_poiss = X_test[clf.predict(X_test) == 1]\n", "Y_test_poiss = Y_test[clf.predict(X_test) == 1]\"\"\"\n", "X_train_poiss = X_train_without_exp[Y_train_class == 1]\n", "Y_train_poiss = Y_train[Y_train_class == 1]\n", "X_test_poiss = X_test[cv.predict(X_test) == 1]\n", "Y_test_poiss = Y_test[cv.predict(X_test) == 1]\n", "\n", "# log transformation\n", "c = 100 # TODO try others\n", "Y_train_poiss = np.log(c + Y_train_poiss)\n", "Y_test_poiss = np.log(c + Y_test_poiss)\n", "\n", "params = {'alpha': np.logspace(-4, 1, 100)}\n", "reg_base = linear_model.PoissonRegressor(max_iter=1000)\n", "reg = model_selection.GridSearchCV(reg_base, params)\n", "# reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 1000))\n", "reg.fit(X_train_poiss, Y_train_poiss)\n", "print(reg.best_params_)\n", "predict_poiss = reg.predict(X_test_poiss)\n", "\n", "predict_poiss = np.exp(predict_poiss) - c\n", "print('Mean squared error on Poisson data : %.2f'\n", " % mean_squared_error(np.exp(Y_test_poiss) - c, predict_poiss))\n", "print((np.exp(Y_test_poiss) - c)[:20])\n", "print(predict_poiss[:20])\n", "\n", "# on everything\n", "predict = np.zeros(Y_test.shape)\n", "print(cv.predict(X_test_without_exp)[:20])\n", "print(Y_test[:20])\n", "predict[cv.predict(X_test_without_exp) == 1] = predict_poiss\n", "print(predict[:20])\n", "\n", "print('Mean squared error : %.2f'\n", " % mean_squared_error(Y_test, predict))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Models with experience data\n", "We now build the same models but including the data about the experience of participants.\n", "\n", "First we 
build a trivial linear model." ] }, { "cell_type": "code", "execution_count": 186, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The best regularizer is lambda = 0.4310855408791511\n" ] } ], "source": [ "# cross validation to determine regularizer\n", "reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 10000))\n", "reg.fit(X_train, Y_train)\n", "lambda_ = reg.alpha_\n", "print(f\"The best regularizer is lambda = {lambda_}\")" ] }, { "cell_type": "code", "execution_count": 187, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2.722046481720346\n", "[-8.43473593e-02 6.26960317e-03 -3.98085674e-01 -2.90482130e-01\n", " -5.12914835e-01 -3.71611114e-01 1.34242274e+00 6.49170731e-01\n", " -5.44319820e-01 -2.72265577e-01 -2.89018243e-03 9.55562443e-02\n", " 6.49635158e-02 -4.68683931e-02 6.65223015e-01 -9.84437091e-01\n", " -1.80362446e+00 -5.12692191e-01 -9.91806006e-01 -1.68002152e+00\n", " -1.53586192e+00 2.68399798e+00 -1.82601330e+00 1.64301497e+01\n", " -1.70359009e+00 -1.58728094e+00 -1.75394356e+00 -1.34535741e+00\n", " 1.10495209e-01 -9.26532156e-01 -3.82603590e-01 -1.89918146e+00\n", " -1.51184759e+00 -1.41230163e+00 -1.42684129e+00 3.74873110e+00\n", " -1.52708101e+00 -1.46017817e+00 8.08336879e+00 -1.28808406e+00\n", " -1.38479050e+00 -1.13317333e+00 -1.54291836e+00 -1.57391126e+00\n", " -1.70068092e+00 -1.55003997e+00 1.29949447e+01 -1.28224213e+00\n", " -1.75957248e+00 -3.46347655e-01 4.55078495e+01 1.62410914e+00\n", " -1.90951002e+00 -1.95422864e+00 -1.53798071e+00 -6.34843829e-01\n", " -2.09753123e+00 -1.25685698e+00 -1.36027213e+00 -1.66825055e+00\n", " -1.70448231e+00 -1.94765895e+00 -1.69766898e+00 -1.78561157e+00\n", " -1.60419197e+00 -1.40494019e+00 -6.96473887e-01 3.99984755e-01\n", " -1.11479103e+00 -1.29756435e+00 -1.72272990e+00 -1.90810839e+00\n", " -1.66195359e+00 -1.46676751e+00 -1.46267946e+00 -1.78615344e+00\n", " -1.17752944e+00 -1.37495124e+00 -7.94814627e-01 -1.64157573e+00\n", " -1.11586971e+00 -4.94737185e-01 -1.88415981e+00 -4.46839462e-01\n", " -1.24743318e+00 -1.93622801e+00 -1.84826398e+00 -1.34628300e+00\n", " -1.61211143e+00 -1.34232147e+00 -1.06422000e+00 -5.80303602e-01\n", " 7.48796414e+00 5.25168406e-01 1.07299085e+00 -1.13009502e+00\n", " -1.78922911e+00 -1.71047559e+00 -1.67376033e+00 -1.37914625e+00\n", " 1.52506273e+01 -1.45948639e+00 -8.00040238e-01 -7.59025274e-01\n", " -1.39121627e+00 6.10507397e-01 -1.41321334e+00 -1.87197330e+00\n", " -1.77123296e+00 -1.55599978e+00 -1.83748479e+00 -1.54290840e+00\n", " -1.29645972e+00 -1.69492558e+00 -1.65978325e+00 -1.69882178e+00\n", " -1.40547454e+00 -1.65781563e+00 -1.50002388e+00 8.02561464e-01\n", " -1.45477912e+00 -1.50131342e+00 -1.61676080e+00 9.57403312e-01\n", " -1.17694765e+00 -1.07033624e+00 1.03583989e+00 -4.34109768e-01\n", " -1.85113699e+00 -1.75541627e+00 -1.71456365e+00 -1.36478911e+00\n", " -1.78648122e+00 -1.67076913e+00 -1.47124550e+00 -1.46762853e+00\n", " -1.43353122e+00 -1.15059462e+00 -1.03921150e+00 7.77069994e+00\n", " -6.52537753e-01 -1.95359579e+00 4.92766321e-03 -1.63400977e+00\n", " -1.37799849e+00 7.73507960e+00 -1.07260906e+00 -5.51386965e-02\n", " -1.45188231e+00 -5.80168754e-01 -1.20330261e+00 9.09892623e-01\n", " -1.35067420e+00 -1.67099026e-01 2.93620071e+00 -8.90215432e-01\n", " -1.88844485e+00 -9.98748467e-01 -1.57422992e+00 5.19862332e+00\n", " -1.65280877e+00 5.05719830e-02 -1.64993873e+00 -1.73851840e+00\n", " 1.31559722e+01 -8.57390932e-01 -1.51660865e+00 -1.70640892e+00\n", " 
-1.32325122e+00 -4.10204037e-01 -1.76533400e+00 -1.32027774e+00\n", " -1.60915427e+00 -9.52788926e-01 5.03475436e+00 4.64582740e-01\n", " -9.04880738e-01 -1.27639527e+00 -1.39169112e+00 -1.72193367e+00\n", " -1.45730161e+00 -1.62931366e+00 -1.14442318e+00 -1.47834389e+00\n", " -1.60558557e+00 8.53798000e+00 -1.29385700e+00 -1.39512784e+00\n", " -2.46724543e-01 -7.95448169e-01 -1.05882990e+00 -1.77374134e+00\n", " -1.48409808e+00 -1.17958151e+00 -1.84262886e+00 -6.26374735e-01\n", " -1.73393168e+00 3.82567364e+00 -4.48226284e-01 -8.15663029e-01\n", " -1.05530367e+00 -2.12943893e+00 5.29590505e+01 -4.63005762e-01\n", " -1.70223872e+00 -1.72729716e+00 -1.63683781e+00 1.29198850e+00\n", " -1.60115424e+00 -1.63197052e+00 -1.14126707e+00 -8.96069743e-01\n", " -2.18061272e+00 0.00000000e+00]\n" ] } ], "source": [ "reg = linear_model.Ridge(alpha=lambda_)\n", "reg.fit(X_train, Y_train)\n", "w0 = reg.intercept_\n", "W = reg.coef_\n", "print(w0)\n", "print(W)" ] }, { "cell_type": "code", "execution_count": 188, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ 1.18297512e+00 -7.42407867e-01 1.65214044e+00 -2.46034308e-02\n", " -4.69021814e-01 8.68726653e-01 6.72104627e+00 8.29711110e-01\n", " 1.30591417e+00 3.87780478e-01 2.73052330e-02 -8.52067029e-01\n", " 2.42915316e+00 6.22925442e-03 9.82701624e-01 1.85688847e+00\n", " 5.14099547e-01 6.61749796e-01 -1.34684735e-01 -1.17630981e-01]\n", "[0. 0. 1. 0. 0. 1. 6. 0. 4. 0. 0. 1. 2. 0. 5. 0. 1. 0. 0. 0.]\n", "Mean squared error: 29.32\n" ] } ], "source": [ "test_predict = reg.predict(X_test)\n", "print(test_predict[:20])\n", "print(Y_test[:20])\n", "print('Mean squared error: %.2f'\n", " % mean_squared_error(Y_test, test_predict))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Second, we do the linear model with a logistic transformation." ] }, { "cell_type": "code", "execution_count": 201, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "c = 10\n", "Lambda: 0.3544455673970436\n", "[2.36868331 2.27255708 2.41760197 ... 2.97955573 2.31652432 3.87262415]\n", "[2.30258509 2.30258509 2.39789527 ... 2.30258509 2.30258509 3.8501476 ]\n", "[ 0.68331637 -0.29581649 1.21892376 ... 9.67907192 0.14036829\n", " 38.06835913]\n", "[ 0. 0. 1. ... 0. 0. 37.]\n", "sanity\n", "[[23. 10. 0. ... 0. 0. 0.]\n", " [15. 2. 1. ... 0. 0. 0.]\n", " [14. 13. 0. ... 0. 0. 0.]\n", " ...\n", " [21. 11. 1. ... 0. 0. 0.]\n", " [18. 10. 0. ... 0. 0. 0.]\n", " [24. 6. 1. ... 0. 0. 0.]]\n", "Mean squared error : 34.32\n", "c = 100\n", "Lambda: 0.3643858983763548\n", "[4.61541371 4.59918935 4.62094193 ... 4.70766727 4.60605685 4.97402359]\n", "[4.60517019 4.60517019 4.61512052 ... 4.60517019 4.60517019 4.91998093]\n", "[ 1.02961728 -0.59629879 1.58967752 ... 10.79340722 0.08870552\n", " 44.60755927]\n", "[ 0. 0. 1. ... 0. 0. 37.]\n", "sanity\n", "[[23. 10. 0. ... 0. 0. 0.]\n", " [15. 2. 1. ... 0. 0. 0.]\n", " [14. 13. 0. ... 0. 0. 0.]\n", " ...\n", " [21. 11. 1. ... 0. 0. 0.]\n", " [18. 10. 0. ... 0. 0. 0.]\n", " [24. 6. 1. ... 0. 0. 0.]]\n", "Mean squared error : 30.59\n", "c = 500\n", "Lambda: 0.4070142453219439\n", "[6.2168895 6.21320115 6.21788416 ... 6.23641285 6.21472897 6.30389222]\n", "[6.2146081 6.2146081 6.2166061 ... 6.2146081 6.2146081 6.28599809]\n", "[ 1.14200538 -0.70298117 1.64071919 ... 11.02210731 0.06044109\n", " 46.69563152]\n", "[ 0. 0. 1. ... 0. 0. 37.]\n", "sanity\n", "[[23. 10. 0. ... 0. 0. 0.]\n", " [15. 2. 1. ... 0. 0. 0.]\n", " [14. 13. 0. ... 0. 0. 0.]\n", " ...\n", " [21. 11. 
1. ... 0. 0. 0.]\n", " [18. 10. 0. ... 0. 0. 0.]\n", " [24. 6. 1. ... 0. 0. 0.]]\n", "Mean squared error : 29.64\n", "c = 1000\n", "Lambda: 0.41842885079015846\n", "[6.90891612 6.90703328 6.90940047 ... 6.91874894 6.90781098 6.95374095]\n", "[6.90775528 6.90775528 6.90875478 ... 6.90775528 6.90775528 6.94408721]\n", "[ 1.16151439 -0.72173879 1.64655001 ... 11.05431636 0.05569874\n", " 47.0594116 ]\n", "[ 0. 0. 1. ... 0. 0. 37.]\n", "sanity\n", "[[23. 10. 0. ... 0. 0. 0.]\n", " [15. 2. 1. ... 0. 0. 0.]\n", " [14. 13. 0. ... 0. 0. 0.]\n", " ...\n", " [21. 11. 1. ... 0. 0. 0.]\n", " [18. 10. 0. ... 0. 0. 0.]\n", " [24. 6. 1. ... 0. 0. 0.]]\n", "Mean squared error : 29.49\n" ] } ], "source": [ "cs = [10, 100, 500, 1000]\n", "\n", "for c in cs:\n", " print(f\"c = {c}\")\n", " # logarithmic transformation\n", " Y_train_transf = np.log(c + Y_train)\n", " Y_test_transf = np.log(c + Y_test)\n", " \n", " # crossvalidation for lambda\n", " reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 1000))\n", " reg.fit(X_train, Y_train_transf)\n", " print(f\"Lambda: {reg.alpha_}\")\n", " test_predict_transf = reg.predict(X_test)\n", " # transform the output back\n", " test_predict = np.exp(test_predict_transf) - c\n", " \n", " print(test_predict_transf)\n", " print(Y_test_transf)\n", " print(test_predict)\n", " print(Y_test)\n", " \n", " print('Mean squared error : %.2f'\n", " % mean_squared_error(Y_test, test_predict))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Last, we apply the two step model with Logistic Regression and the Poisson Regressor." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/report/data_processing.tex b/report/data_processing.tex index 4665784..cb30226 100644 --- a/report/data_processing.tex +++ b/report/data_processing.tex @@ -1,381 +1,381 @@ \section{Data Extraction and Processing} The main part of this project is to extract the information contained in the participant lists of UNFCCC meetings. We explain in this chapter how we extracted and processed the data of the PDF lists. \subsection{Data Extraction} This section describes task one of the project, i.e., how we extract the data from the PDF participant lists. The first step consists of transforming the available PDF files into text files. It is important to keep some information about the structure of the text to be able to find the relevant data in the resulting text file. The second step consists of transforming the text files to comma-separated values (CSV) files. The result of this task is a CSV file for each processed participant list that contains the entries \textit{affiliation category, affiliation, name, description}. \subsubsection{Raw Dataset} \label{dataset} We download the participant lists from the document webpage of the UNFCCC secretariat. \cite{UNFCCC_docs} The lists we process contain all the COP meetings and almost all the SB meetings. 
Note that during a COP, there is usually an SB meeting held in parallel for which there is no separate participant list. \\ A participant list has the following general structure: Participants are listed under the affiliation they belong to. A member of the Swiss government is for example listed under the affiliation “Switzerland”. A participant is attributed a salutary address that contains at least "Mr." or "Ms.", but may also contain titles such as "H.E." (i.e. "Her Excellency"). Some participants, but not all of them, are attributed a description that explains their role within the delegation. This description could for example be "Minister of Foreign Affairs". Affiliations are sorted according to their affiliation category and then alphabetically. The possible affiliation categories are: \begin{itemize} \item Parties \item Observer States \item United Nations Secretariat units and bodies \item Specialized agencies and related organizations \item Intergovernmental organizations \item Non-governmental organizations \end{itemize} The category "Media" exists for newer participant lists, but the corresponding participants are not listed. We therefore exclude this category. \\ The format of the participant lists varies over time. For the first meetings the participant lists are paper scans, which means that we need to convert images to text. Furthermore, the manner in which affiliations are indicated varies: in the first meetings they are always written in all uppercase letters, which changed in later meetings. \\ We choose the version of the participant lists that is published during the last days of a meeting. We exclude the corrigenda, documents that are published later for some participant lists and contain corrections of the lists, because their format varies a lot and many of the listed corrections are rather small (change of order of participants within an affiliation, change of descriptions). \subsubsection{Optical Character Recognition} To extract the data from the scanned lists, we use Optical Character Recognition (OCR), more precisely Python-tesseract (pytesseract \cite{pytesseract}). Python-Tesseract is a wrapper for the OCR engine Tesseract, developed by Google since 2006. \\ Tesseract works as follows. First, it performs a connected component analysis to find blobs that are organized as text lines. Then, a two-pass recognition process is applied. % TODO connected component analysis In the first pass, the program tries to recognize each word. If a word is recognized satisfactorily, it is used as training data for every word that follows. To make use of all the training data, the second pass goes over all unrecognized words a second time. \cite{tesseract_expl} \\ The version of Tesseract that we use introduces an LSTM-based neural network recognition engine. % TODO LSTM neural nets In the dataset of this project, the Tesseract OCR engine fails for some specific pages that contain only sparsely distributed participants without descriptions. To improve the performance of the connected component analysis, we insert half-transparent boxes on pages that encounter this problem (see Figure \ref{fig:boxes}). This ensures the correct order of names in the resulting text file. 
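For illustration, the following is a minimal sketch of this OCR step with pytesseract; the file name and the box coordinates are placeholders, and the snippet is not the exact script used in the project.
\begin{verbatim}
# Minimal sketch of the OCR step described above (placeholder path and box).
from PIL import Image, ImageDraw
import pytesseract

def ocr_page(image_path, box=None):
    """Run Tesseract on one scanned page, optionally drawing a
    half-transparent box first to guide the connected component analysis."""
    page = Image.open(image_path).convert("RGB")
    if box is not None:
        draw = ImageDraw.Draw(page, "RGBA")
        # half-transparent grey rectangle over the sparse region
        draw.rectangle(box, fill=(200, 200, 200, 128))
    return pytesseract.image_to_string(page, lang="eng")

text = ocr_page("scanned_page.png", box=(300, 150, 1100, 900))
\end{verbatim}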
\begin{figure}[ht] \caption{Page with an inserted half-transparent box before OCR} \centering \includegraphics[width=0.4\textwidth]{boxes_tesseract.png} \label{fig:boxes} \end{figure} % TODO bigger % TODO change title \subsubsection{Well-formatted PDF Extraction} To extract the data from the well-formatted PDF files, we use a PDF processing package called Pdfminer.six.\cite{pdfminer.six} Again, the main difficulty is to extract the text of the list in the correct order. Especially for documents with three columns, this becomes a difficult task. For this reason, we adapted the use of Pdfminer.six by rewriting one of its classes, the \texttt{PDFPageAggregator}. \\ First, we briefly explain how pdfminer extracts text from PDF files. Pdfminer.six performs a layout analysis on every page before extracting the text. This analysis is done in three stages: \begin{itemize} \item Group characters into words and lines \item Group lines into boxes \item Group textboxes hierarchically \end{itemize} The output of the layout analysis is visualized in Figure \ref{fig:pdfminer}.\\ \begin{figure}[ht] \caption{Output of the layout analysis of pdfminer.six} \centering \includegraphics[width=0.9\textwidth]{pdfminer.png} \label{fig:pdfminer} \end{figure} The class we want to modify, \texttt{PDFPageAggregator}, is responsible for outputting the text lines of a page in the determined order. To be able to sort the text lines according to our rules later, we modify the function \texttt{receive\_layout} such that it outputs for each \texttt{LTTextLine} the available $x$ and $y$ positions within the page. In our script that performs the extraction, we then define rules to determine in which column a text line is situated. \\ A special case in the page layout is the affiliation category titles. They break the column system in the middle of a page. We therefore need to recognize them by their content and introduce special rules for pages that contain affiliation category titles.\\ Another difficulty is the recognition of new affiliations. Pdfminer.six is not able to get information about font style, so the only way to detect new affiliations is through line breaks and the fact that names always start with a salutary address. As line breaks are automatically preserved with pdfminer, we encounter problems only in special situations: when a new affiliation is at the top of a column and longer than two lines, we cannot distinguish it from the description of a previous participant that is split across two columns. \subsubsection{Extraction from Text Files} We now need to extract the information from the generated text files. We do this with the following procedure: \begin{enumerate} \item Remove unnecessary elements from the text file, e.g. page numbers and page headers. \item Iterate through the rows of the text file and repeatedly apply: \begin{enumerate} \item Check if the current line is the beginning of a new affiliation category. We do this by keyword checking. \item Check if the current line is a new affiliation. We look for format cues like a row in uppercase letters (for early meetings) or lines that come after a double line break and do not start with a salutary address. \item Check if the current line is a new name for the current affiliation by detecting a salutary address. \item If none of the above is the case, add the line to the description of the current participant. \end{enumerate} \end{enumerate} 
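For illustration, a minimal Python sketch of this classification loop is shown below; the keyword and salutation lists are simplified placeholders and do not reproduce the project's exact implementation.
\begin{verbatim}
# Simplified sketch of the line classification loop described above.
CATEGORY_KEYWORDS = ("Parties", "Observer States",
                     "Non-governmental organizations")  # placeholder subset
SALUTATIONS = ("Mr.", "Ms.", "Sr.", "Sra.", "H.E.")      # placeholder subset

def parse_lines(lines):
    participants = []        # rows: [category, affiliation, name, description]
    category = affiliation = None
    after_blank = False
    for line in lines:
        stripped = line.strip()
        if not stripped:
            after_blank = True
            continue
        if stripped in CATEGORY_KEYWORDS:                 # (a) new category
            category = stripped
        elif stripped.isupper() or (after_blank and
              not stripped.startswith(SALUTATIONS)):      # (b) new affiliation
            affiliation = stripped
        elif stripped.startswith(SALUTATIONS):            # (c) new participant
            participants.append([category, affiliation, stripped, ""])
        elif participants:                                 # (d) description line
            participants[-1][3] = (participants[-1][3] + " " + stripped).strip()
        after_blank = False
    return participants
\end{verbatim}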
Note that this algorithm fails for participants whose entries do not start with a salutary address. But as this case only happens a few times in all the processed lists, we can neglect this error. \subsection{Data Processing} This section describes task two of the project, i.e., how we gain more information from the extracted data. % TODO rewrite The goal is to post-process the CSV files and bring all the meetings together into one dataset that contains more attributes per participant. \subsubsection{Unification of Meetings} In order to make our complete dataset as consistent as possible, we need the same affiliation to be named identically throughout all the meetings. For some earlier meetings, e.g. COP 2, the English versions of the participant lists were not available. We therefore processed the French versions of their participant lists. With the help of a dictionary, we translate the names of all the parties to English. Once all the country names are in English, we still need to unify them to a single denotation per country. For example, the party Venezuela is named "Venezuela" in the participant list of COP 6, but "Venezuela (Bolivarian Republic of)" in COP 25. To unify the English country names, we use the Python package country-converter. \cite{coco} We use it to change every country name to its "short" name. "Venezuela (Bolivarian Republic of)" then becomes "Venezuela". This package has its limits when misspellings occur. For example, an error in the OCR process caused Iran to be spelled "Tran" in COP 1, which prevents the country-converter from recognizing it correctly. \\ Note that we applied translation and unification only to the parties. Even though in some earlier meetings there are other affiliations that are written in different ways, unifying them would have been more difficult and more error-prone due to the larger number of possible names. We therefore decided not to apply unification to the rest of the participant list entries, including the descriptions. \subsubsection{Gender and Title} The easiest additional attributes to extract are the gender and title of participants. This is due to the very static structure of names in the UNFCCC participant lists: Each name starts with a salutary address ("Mr.", "Ms.", "Sr.", "Sra." etc.) that is associated with either male or female. By simply checking this salutary address, we can extract the gender of each participant. Optionally, the salutary address contains a title like "H.E." ("Her Excellency"), "Dr." or "Prof.". We set a binary attribute \textit{has\_title} to 1 if a participant is listed with such a title, and to 0 otherwise. \subsubsection{Roles} The descriptions of the participants contain more information about the participant, but in a very inconsistent format. Every affiliation can decide what to provide as the descriptions of its participants. We extract information out of the descriptions by defining roles. A role describes the function of a participant within their affiliation. \\ We assign a role to a participant by looking for keywords in their description. The following list contains the roles that we look for and some corresponding keywords, in order of decreasing priority. If a description contains keywords from more than one role, it is assigned the one with the higher priority. 
\begin{itemize}
 \item Security (Security Officer, Security Service)
 \item Diplomacy (Ambassador, Embassy, Diplomatic)
 \item Government (Ministry, Minister, Government, Parliament, Agency, Department of, European Commission, Presidential Office)
 \item Press (Journalist, Reporter, Radio, Press)
 \item Universities (Professor, Researcher, Student, University)
\end{itemize}
The reason for "Security" having the highest priority is that security services are often provided to people with other roles. With our priority rule, the description "Security Officer of the Minister" is assigned the role "Security" and not "Government", which is the correct choice. On the other hand, we avoid the keyword "Security" for this role to prevent a description like "Minister for Politics, Law, and Security Affairs" from being assigned the role "Security". \\
\subsubsection{Association to Fossil Fuel Industry}
We also use keywords to determine whether or not a participant is associated with the fossil fuel industry. We treat this separately from the roles because we check the keywords not only against the description but also against the affiliation name. For example, we want to detect all participants of the NGO "Canadian Association of Petroleum Producers" as associated with the fossil fuel industry, even if they do not have any description. \\
A further advantage is that a participant associated with the fossil fuel industry can still have a role. For example, Saudi Arabia has a Ministry of Petroleum; the corresponding Minister is assigned the role "Government" but is still associated with the fossil fuel industry.
\subsubsection{Experience}
When bringing together the data of all the different meetings, we are interested in the experience of participants. We define experience as the number of earlier UNFCCC meetings that a participant has attended. We distinguish between experience in SB meetings and in COP meetings, as they have quite different characteristics. Furthermore, we distinguish between experience within a delegation of a Party to the Convention (i.e. category "parties") and experience in a delegation of a non-governmental organization. \\
To determine the experience, we have to compare names across different meetings. There are some situations where a plain text comparison would fail, even though it is the same person:
\begin{itemize}
 \item Different spellings of the name, e.g. the simplification of a special character (e instead of é)
 \item Long names that do not fit on a single line are not detected entirely in the newer PDFs with three columns; hence, only a part of the name is detected. % TODO example
 \item The order of names is swapped (e.g. "Obama Barack" instead of "Barack Obama")
\end{itemize}
We decided to handle these cases in the following manner:
\begin{itemize}
 \item Allow an edit distance of 1 (see below).
 \item Consider two names as the same when one starts with the other ("Alexander Van der Bellen" and "Alexander Van der" are considered to be the same person). We exclude names with fewer than 15 characters from this rule to guarantee that a line break is involved.
 \item If the sets of words of two names are equal, the persons are considered to be the same.
\end{itemize}
We compute the \textbf{edit distance} between names. There exist several types of edit distances; all of them count the minimum number of operations needed to transform one string into the other. We need to keep the accepted distance very small to keep the error rate low.
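A minimal sketch of how these comparison rules could be combined is given below; it uses the Levenshtein distance, which we motivate in the following paragraph, and the function names are illustrative rather than our exact implementation:

\begin{verbatim}
# Sketch of the name matching rules (simplified, illustrative names).
def levenshtein(a, b):
    """Minimum number of single-character substitutions, insertions and deletions."""
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            current.append(min(previous[j] + 1,                  # deletion
                               current[j - 1] + 1,               # insertion
                               previous[j - 1] + (ca != cb)))    # substitution
        previous = current
    return previous[-1]

def same_person(name_a, name_b):
    a, b = name_a.lower().strip(), name_b.lower().strip()
    if levenshtein(a, b) <= 1:                                   # rule 1: small edit distance
        return True
    if min(len(a), len(b)) >= 15 and (a.startswith(b) or b.startswith(a)):
        return True                                              # rule 2: truncated long names
    if set(a.split()) == set(b.split()):                         # rule 3: swapped word order
        return True
    return False
\end{verbatim}

For example, the spellings "yong chul cho" and "yongchul cho" differ only by a missing space and would be matched by the first rule.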
With over 130,000 distinct participants, the occurrence of very similar names is probable, which is why the accepted distance must stay small. To obtain the behaviour we want, substitution must be allowed, so that a missed special character or a typo can simply be replaced by the correct character. We compare the performance of two edit distances. \\
The \textbf{Hamming distance} only allows substitution; hence, the compared strings need to have the same length. It is equal to the number of positions at which the symbols of the two strings differ. The \textbf{Levenshtein distance} allows substitution, insertion and deletion. It is equal to the minimum number of single-character edits required to change one string into the other. Mathematically,
\begin{equation} \label{levenshtein}
 \mathrm{lev}(a,b) =
 \begin{cases}
  \lvert a \rvert & \text{if } \lvert b \rvert = 0 \\
  \lvert b \rvert & \text{if } \lvert a \rvert = 0 \\
  \mathrm{lev}(\mathrm{tail}(a), \mathrm{tail}(b)) & \text{if } a[0] = b[0] \\
  1 + \min
  \begin{cases}
   \mathrm{lev}(\mathrm{tail}(a), b) \\
   \mathrm{lev}(a, \mathrm{tail}(b)) \\
   \mathrm{lev}(\mathrm{tail}(a), \mathrm{tail}(b))
  \end{cases} & \text{otherwise}
 \end{cases}
\end{equation}
where, for a string $x$, $\mathrm{tail}(x)$ is the string without its first character and $\lvert x \rvert$ is its length. \\
When comparing the results of the Levenshtein and the Hamming distance on our data, the additional pairs identified as the same person by the Levenshtein distance were mostly correct. One common case is a forgotten apostrophe (e.g. "yaara peretz" and "ya'ara peretz") or a missing space (e.g. "yong chul cho" and "yongchul cho"). Some false positives are introduced, but these are mostly due to similar common names (e.g. "yan jia" and "yuan jia"). Based on this comparison, we choose the Levenshtein distance for the final implementation. \\
To mark potential false positives, we add another attribute to the dataset that is set to one if a participant's name has been detected twice in one of the earlier meetings. When this flag is set, the experience attributes contain an error. \\
In addition to the experience attributes that we add to our dataset, we obtain, for each participant, the information of which meetings they attended and within which affiliation. \\
Note that a delegation is one instance of an affiliation: each affiliation comes to a new meeting with a new delegation. To be able to compare delegations with respect to the experience of their participants, we need to define a metric for the experience of a delegation. We call this the \textbf{experience score} of an affiliation and define it as follows:
\begin{equation}
 \mathrm{ExperienceScore}(\text{delegation}) = \mathrm{avg}(\text{total experience of the top 10 most experienced participants})
\end{equation}
The reason for choosing only the top 10 is that delegations are sometimes very large, with only a few participants actively involved in the negotiation process.
\subsection{Results}
We process the participant lists of 54 UNFCCC meetings, 26 COPs and 28 SBs, and find 271,434 entries in total. \\
On average, we find 8,353 participants per COP meeting and 1,949 participants per SB meeting. We show in Figures \ref{fig:cop_overall} and \ref{fig:sb_overall} the total numbers of extracted participants for all COP and SB meetings, respectively.
% TODO comment figures!
% TODO maybe add number of distinct affiliations (easy).
% TODO maybe add plots for some affiliations, some top 10
\begin{figure}
 \centering
 \begin{minipage}[ht]{.5\textwidth}
  \captionsetup{width=.8\linewidth}
  \captionof{figure}{Overview of the extracted participants of COP meetings}
  \centering
  \includegraphics[width=0.6\textwidth]{participants_per_cop.png}
  \label{fig:cop_overall}
 \end{minipage}%
 \begin{minipage}[ht]{.5\textwidth}
  \captionsetup{width=.8\linewidth}
  \captionof{figure}{Overview of the extracted participants of SB meetings}
  \centering
  \includegraphics[width=0.6\textwidth]{participants_per_sb.png}
  \label{fig:sb_overall}
 \end{minipage}
\end{figure}
\subsubsection{Gender and Title}
The proportion of women has steadily increased since the first meetings. Starting at a rate of 21.4\% at the first SB meeting in 1995, it reached its peak so far of 47.3\% at SB 50 in 2019. Figure \ref{fig:gender} shows the continuously increasing trend of this measure, with a slightly higher rate of women at SB meetings compared to COP meetings.
\begin{figure}[ht]
 \caption{Proportion of female participants per meeting}
 \centering
 \includegraphics[width=0.8\textwidth]{gender.png}
 \label{fig:gender}
\end{figure}
The UNFCCC secretariat publishes gender composition reports, as its goal is to achieve gender balance at its meetings, which may lead to more gender-sensitive climate policies. These reports show that even though the numbers are approaching 50\% in the latest meetings, equality has not yet been reached: the proportion of women is lower when looking only at the Parties to the Convention, and it is also significantly lower when considering the heads of delegations. \cite{UNFCCC_genderreport} \\
The number of participants with a title is generally rather low. For COP meetings, the average rate of participants with a title is 3.9\%; for SB meetings, the average is 1.8\%.
-\subsubsection{Roles}
+\subsubsection{Roles} \label{roles}
The assigned roles are mainly of interest for parties, as the descriptions are most exhaustive for their delegates and also contain more keywords. Figures \ref{fig:cop_roles} and \ref{fig:sb_roles} show the percentage of each role found among the party delegates for COP and SB meetings, respectively. The main role is "Government". The role "no keyword found" in the plots comprises the participants that did not match any keyword; the role "no description" comprises the participants that did not have a description in the source document.
\begin{figure}
 \centering
 \begin{minipage}{.5\textwidth}
  \captionof{figure}{Assigned roles for COP meetings}
  \centering
  \includegraphics[width=1\linewidth]{roles_cop.png}
  \label{fig:cop_roles}
 \end{minipage}%
 \begin{minipage}{.5\textwidth}
  \captionof{figure}{Assigned roles for SB meetings}
  \centering
  \includegraphics[width=1\linewidth]{roles_sb.png}
  \label{fig:sb_roles}
 \end{minipage}
\end{figure}
\subsubsection{Association to Fossil Fuel Industry}
The number of detected participants that openly represent the fossil fuel industry varies considerably from meeting to meeting. Figures \ref{fig:cop_fossil} and \ref{fig:sb_fossil} show the absolute numbers of detected fossil fuel industry representatives for COP and SB meetings, respectively. The average rate of participants with a fossil fuel industry association is 1.7\% for COP meetings and 2.7\% for SB meetings. These rates have decreased over the years as the number of participants has increased.
\begin{figure}
 \centering
 \begin{minipage}[ht]{.5\textwidth}
  \captionsetup{width=.8\linewidth}
  \captionof{figure}{Participants with fossil fuel industry association (COP)}
  \centering
  \includegraphics[width=1\linewidth]{ff_cop.png}
  \label{fig:cop_fossil}
 \end{minipage}%
 \begin{minipage}[ht]{.5\textwidth}
  \captionsetup{width=.8\linewidth}
  \captionof{figure}{Participants with fossil fuel industry association (SB)}
  \centering
  \includegraphics[width=1\linewidth]{ff_sb.png}
  \label{fig:sb_fossil}
 \end{minipage}
\end{figure}
-\subsubsection{Experience}
+\subsubsection{Experience} \label{experience}
Over all meetings, we find 138,940 distinct participants. \\
We identify 193 persons that have participated in at least half of the 54 processed meetings. The most experienced participants and their affiliations at COP 25 are the following:
\begin{enumerate}
 \item Helmut Hojesky: Austria (26 COP, 27 SB) % TODO victor add more information?
 \item Norine Kennedy: United States Council for International Business (25 COP, 28 SB)
 \item Manfred Treber: Germanwatch (26 COP, 26 SB)
\end{enumerate}
% TODO flow of participants TODO "Movements of participants" \\
% TODO experience score graphs (include austria!!)
We use our Experience Score to compare affiliations according to their experience. Figure \ref{fig:expscore_overview} shows the average Experience Score over all affiliations per meeting. The separation of the bars shows whether the experience was gained mainly in COP or in SB meetings.
\begin{figure}[ht]
 \caption{Average Experience Score over time}
 \centering
 \includegraphics[width=1\textwidth]{experiencescore_overview.png}
 \label{fig:expscore_overview}
\end{figure}
% TODO plot experience score for some affiliations (include Austria)
\ No newline at end of file
diff --git a/report/predictive_modelling.tex b/report/predictive_modelling.tex
index a767000..6ede844 100644
--- a/report/predictive_modelling.tex
+++ b/report/predictive_modelling.tex
@@ -1,11 +1,82 @@
\section{Predictive Modelling} \label{predictive_modelling}
Having extracted and processed the data contained in the participant lists, we use it to build predictive models for other data. First, we build linear models for the data on interventions at UNFCCC meetings collected by Tatiana Cogne and Victor Kristof (see \ref{tatiana}). Note that we cannot go further on this topic due to time constraints, but there is more potential for building models with our data, especially for the interaction dataset also collected by Tatiana Cogne and Victor Kristof.
\subsection{Predict Interventions}
+The interventions data lists, for different UNFCCC meetings, how many times each party intervened at that meeting.
+We build a model that predicts, for a given party and meeting, the number of interventions of that party at that meeting.
+Figure \ref{fig:interv_distr} plots the distribution of the interventions, i.e. the distribution of the labels of the complete dataset.
+Most parties make no or only one intervention, while some parties intervene far more often.
+
+\begin{figure}[ht]
+ \caption{Distribution of the intervention labels}
+ \centering
+ \includegraphics[width=0.7\textwidth]{distr_interventions.png}
+ \label{fig:interv_distr}
+\end{figure}
+
+\subsubsection{Data samples}
+We define a data sample $x_i$ as the participation of a party at a meeting. Note that we only consider parties and no other
+affiliations, as only parties are able to make interventions in the official negotiations. The structure of
+a data sample is defined as follows.
+
+\begin{equation*}
+ x_i =
+ \begin{bmatrix}
+  year - 1995 \\
+  number\_of\_delegates \\
+  meeting\_type \\
+  government\_rate \\
+  diplomacy\_rate \\
+  security\_rate \\
+  press\_rate \\
+  university\_rate \\
+  no\_description\_rate \\
+  no\_keyword\_rate \\
+  nb\_fossil\_fuel\_industry\_associations \\
+  woman\_proportion \\
+  experience\_score\_cop \\
+  experience\_score\_sb \\
+  experience\_score\_parties\_rate \\
+  is\_Albania \\
+  is\_Algeria \\
+  \vdots \\
+  is\_Zimbabwe \\
+  is\_unrecognized\_country
+ \end{bmatrix}
+ \in \mathbb{R}^{213}
+\end{equation*}
+
+The attribute \textit{year} is the year in which the meeting took place; we subtract 1995, the year of the first meeting (SB 1),
+to get values closer to zero. The attribute \textit{meeting\_type} is binary: 0 if the meeting was a COP and 1 if it was an SB.
+The attributes \textit{government\_rate} to \textit{no\_keyword\_rate} correspond to the proportion of each role that we assign
+(see \ref{roles}). For the experience score, we provide COP and SB experience as absolute numbers; they sum up to the total
+experience score of an affiliation. The attribute \textit{experience\_score\_parties\_rate} denotes the fraction of the total
+experience score that was acquired within parties (see \ref{experience}).
+The information about the parties is converted into 198 binary attributes, one for each of the 197 Parties to the Convention
+and one for an invalid or unrecognized country. \\
+In total, we have 9217 data samples. We randomly pick about 80\% of these samples, i.e. 7400 samples, as our training set.
+The remaining samples form our test set.
+
 \subsubsection{Models}
+% baseline models
+We first build two \textbf{baseline models}, so that we can later compare our models against these simple references.
+The first baseline model consists simply of always predicting zero interventions, as this is the most common label.
+The second baseline model computes the average number of interventions a party makes over all included meetings
+and always predicts this average.
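+Under the assumption that the feature matrix and the labels are already available as NumPy arrays \texttt{X} and \texttt{y},
+the following sketch illustrates the two baselines next to a ridge regression; the split and the regularization strength
+\texttt{alpha} are placeholders rather than tuned values, and the code is illustrative, not our exact implementation.
+
+\begin{verbatim}
+import numpy as np
+from sklearn.linear_model import Ridge
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+
+# X: (n_samples, 213) feature matrix, y: number of interventions per sample.
+X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
+                                                    random_state=0)
+
+# Baseline 1: always predict zero interventions.
+rmse_zero = np.sqrt(mean_squared_error(y_test, np.zeros_like(y_test)))
+
+# Baseline 2: always predict the average number of interventions in the training set.
+mean_prediction = np.full(len(y_test), np.mean(y_train))
+rmse_mean = np.sqrt(mean_squared_error(y_test, mean_prediction))
+
+# Ridge regression on all 213 features.
+model = Ridge(alpha=1.0)
+model.fit(X_train, y_train)
+rmse_ridge = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
+\end{verbatim}
+
+Comparing the test errors of the two baselines with the error of the learned model then quantifies how much predictive
+signal the participant-list features carry.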
+ +% linear model + + +% linear model with logarithmic transformation + + +% mixed model + -\subsubsection{Results} \ No newline at end of file +\subsubsection{Results} +To be able to quantify the results of our models, we first \ No newline at end of file diff --git a/report/report.aux b/report/report.aux index 9b37ee2..f807e4c 100644 --- a/report/report.aux +++ b/report/report.aux @@ -1,100 +1,105 @@ \relax \providecommand\hyper@newdestlabel[2]{} \providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} \HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined \global\let\oldcontentsline\contentsline \gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} \global\let\oldnewlabel\newlabel \gdef\newlabel#1#2{\newlabelxx{#1}#2} \gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} \AtEndDocument{\ifx\hyper@anchor\@undefined \let\contentsline\oldcontentsline \let\newlabel\oldnewlabel \fi} \fi} \global\let\hyper@last\relax \gdef\HyperFirstAtBeginDocument#1{#1} \providecommand\HyField@AuxAddToFields[1]{} \providecommand\HyField@AuxAddToCoFields[2]{} \providecommand\@newglossary[4]{} \@newglossary{main}{glg}{gls}{glo} \providecommand\@glsorder[1]{} \providecommand\@istfilename[1]{} \@istfilename{report.ist} \@glsorder{word} \citation{ipcc:2018} \citation{UNFCCC} \citation{UNFCCC_process} \citation{evolution_UNFCCC} \citation{UNFCCC_process} \@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{2}{section.1}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {1.1}International Climate Negotiations}{2}{subsection.1.1}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {1.2}Project}{2}{subsection.1.2}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {1.2.1}Larger Project}{2}{subsubsection.1.2.1}\protected@file@percent } \newlabel{tatiana}{{1.2.1}{2}{Larger Project}{subsubsection.1.2.1}{}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {1.2.2}Our project}{2}{subsubsection.1.2.2}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {2}Data Extraction and Processing}{2}{section.2}\protected@file@percent } \citation{UNFCCC_docs} \citation{pytesseract} \citation{tesseract_expl} \citation{pdfminer.six} \@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Data Extraction}{3}{subsection.2.1}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.1}Raw Dataset}{3}{subsubsection.2.1.1}\protected@file@percent } \newlabel{dataset}{{2.1.1}{3}{Raw Dataset}{subsubsection.2.1.1}{}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.2}Optical Character Recognition}{3}{subsubsection.2.1.2}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.3}Well-formatted PDF Extraction}{3}{subsubsection.2.1.3}\protected@file@percent } \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Page with an inserted half-transparent box before OCR\relax }}{4}{figure.caption.2}\protected@file@percent } \providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} \newlabel{fig:boxes}{{1}{4}{Page with an inserted half-transparent box before OCR\relax }{figure.caption.2}{}} \@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Output of the layout analysis of pdfminer.six\relax }}{4}{figure.caption.3}\protected@file@percent } \newlabel{fig:pdfminer}{{2}{4}{Output of the layout analysis of pdfminer.six\relax }{figure.caption.3}{}} \citation{coco} 
\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.4}Extraction from Text Files}{5}{subsubsection.2.1.4}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Data Processing}{5}{subsection.2.2}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.2.1}Unification of Meetings}{5}{subsubsection.2.2.1}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.2.2}Gender and Title}{5}{subsubsection.2.2.2}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.2.3}Roles}{5}{subsubsection.2.2.3}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.2.4}Association to Fossil Fuel Industry}{6}{subsubsection.2.2.4}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.2.5}Experience}{6}{subsubsection.2.2.5}\protected@file@percent } \citation{UNFCCC_genderreport} \@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Overview of the extracted participants of COP meetings\relax }}{7}{figure.caption.4}\protected@file@percent } \newlabel{fig:cop_overall}{{3}{7}{Overview of the extracted participants of COP meetings\relax }{figure.caption.4}{}} \@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Overview of the extracted participants of SB meetings\relax }}{7}{figure.caption.4}\protected@file@percent } \newlabel{fig:sb_overall}{{4}{7}{Overview of the extracted participants of SB meetings\relax }{figure.caption.4}{}} \newlabel{levenshtein}{{1}{7}{Experience}{equation.2.1}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Results}{7}{subsection.2.3}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.3.1}Gender and Title}{7}{subsubsection.2.3.1}\protected@file@percent } \@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Proportion of female participants per meeting\relax }}{8}{figure.caption.5}\protected@file@percent } \newlabel{fig:gender}{{5}{8}{Proportion of female participants per meeting\relax }{figure.caption.5}{}} \@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces Assigned roles for COP meetings\relax }}{8}{figure.caption.6}\protected@file@percent } \newlabel{fig:cop_roles}{{6}{8}{Assigned roles for COP meetings\relax }{figure.caption.6}{}} \@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces Assigned roles for SB meetings\relax }}{8}{figure.caption.6}\protected@file@percent } \newlabel{fig:sb_roles}{{7}{8}{Assigned roles for SB meetings\relax }{figure.caption.6}{}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.3.2}Roles}{8}{subsubsection.2.3.2}\protected@file@percent } +\newlabel{roles}{{2.3.2}{8}{Roles}{subsubsection.2.3.2}{}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.3.3}Association to Fossil Fuel Industry}{8}{subsubsection.2.3.3}\protected@file@percent } \@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces Participants with fossil fuel industry association (COP)\relax }}{9}{figure.caption.7}\protected@file@percent } \newlabel{fig:cop_fossil}{{8}{9}{Participants with fossil fuel industry association (COP)\relax }{figure.caption.7}{}} \@writefile{lof}{\contentsline {figure}{\numberline {9}{\ignorespaces Participants with fossil fuel industry association (SB)\relax }}{9}{figure.caption.7}\protected@file@percent } \newlabel{fig:sb_fossil}{{9}{9}{Participants with fossil fuel industry association (SB)\relax 
}{figure.caption.7}{}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.3.4}Experience}{9}{subsubsection.2.3.4}\protected@file@percent } +\newlabel{experience}{{2.3.4}{9}{Experience}{subsubsection.2.3.4}{}} \@writefile{toc}{\contentsline {section}{\numberline {3}Predictive Modelling}{9}{section.3}\protected@file@percent } \newlabel{predictive_modelling}{{3}{9}{Predictive Modelling}{section.3}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Predict Interventions}{9}{subsection.3.1}\protected@file@percent } -\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.1}Models}{9}{subsubsection.3.1.1}\protected@file@percent } -\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.2}Results}{9}{subsubsection.3.1.2}\protected@file@percent } -\@writefile{toc}{\contentsline {section}{\numberline {4}Conclusion}{9}{section.4}\protected@file@percent } +\@writefile{lof}{\contentsline {figure}{\numberline {10}{\ignorespaces Average Experience Score over time\relax }}{10}{figure.caption.8}\protected@file@percent } +\newlabel{fig:expscore_overview}{{10}{10}{Average Experience Score over time\relax }{figure.caption.8}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {11}{\ignorespaces Distribution of the intervention labels\relax }}{10}{figure.caption.9}\protected@file@percent } +\newlabel{fig:interv_distr}{{11}{10}{Distribution of the intervention labels\relax }{figure.caption.9}{}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.1}Data samples}{10}{subsubsection.3.1.1}\protected@file@percent } +\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.2}Models}{11}{subsubsection.3.1.2}\protected@file@percent } +\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.3}Results}{11}{subsubsection.3.1.3}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {4}Conclusion}{11}{section.4}\protected@file@percent } \bibdata{reference} \bibcite{pdfminer.six}{1} \bibcite{pytesseract}{2} \bibcite{UNFCCC}{3} \bibcite{ipcc:2018}{4} \bibcite{evolution_UNFCCC}{5} \bibcite{tesseract_expl}{6} \bibcite{UNFCCC_docs}{7} \bibcite{UNFCCC_process}{8} \bibcite{UNFCCC_genderreport}{9} \bibcite{coco}{10} \bibstyle{plain} -\@writefile{lof}{\contentsline {figure}{\numberline {10}{\ignorespaces Average Experience Score over time\relax }}{10}{figure.caption.8}\protected@file@percent } -\newlabel{fig:expscore_overview}{{10}{10}{Average Experience Score over time\relax }{figure.caption.8}{}} -\gdef \@abspage@last{12} +\gdef \@abspage@last{14} diff --git a/report/report.tex b/report/report.tex index 7bd08d2..d3886a8 100644 --- a/report/report.tex +++ b/report/report.tex @@ -1,46 +1,47 @@ \documentclass{article} \usepackage{graphicx} \graphicspath{ {./images/} } \usepackage{caption} \usepackage{hyperref} \usepackage{glossaries} \usepackage{amsmath} +\usepackage{amsfonts} \usepackage{cite} %page margins \usepackage{geometry} \geometry{ a4paper, total={170mm,257mm}, left=20mm, top=20mm, } \title{Negotiators and Lobbyists: Understanding the Composition of Delegations to International Climate Negotiations} \author{Jan Linder} \date{15.01.2021} \makeglossaries \loadglsentries{glossary} \begin{document} %\maketitle \input{titlepage} \tableofcontents \newpage \input{introduction} \input{data_processing} \input{predictive_modelling} \input{conclusion} \newpage \printglossaries \bibliography{reference}{} \bibliographystyle{plain} \newpage \listoffigures \end{document} \ No newline at end of file