diff --git a/code/scripts/predict_interventions.ipynb b/code/scripts/predict_interventions.ipynb index 04ddb84..9107d93 100644 --- a/code/scripts/predict_interventions.ipynb +++ b/code/scripts/predict_interventions.ipynb @@ -1,628 +1,522 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Predictive modelling of interventions\n", "## Ridge regression\n", "\n", "This notebook builds predictive models for the number of interventions that a delegation makes at a negotiation meeting, starting with ridge regression.\n", "\n", "\n", "First of all, we import the necessary packages." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn import linear_model\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn.metrics import accuracy_score\n", "\n", "# constants\n", "FIRST_COUNTRY_INDEX = 11" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Prepare the dataset\n", "The data is provided in pandas dataframes. If the necessary CSV files are not available, they can be generated with the script 'prepare_intervention_data.py'. This data now needs to be converted into numpy arrays such that we can train our model." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9217\n", "9217\n", "6257\n", "['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo Republic', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'DR Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', 'Kuwait', 'Kyrgyz Republic', 'Laos', 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macedonia', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Mauritania', 'Mauritius', 'Mexico', 'Micronesia, Fed. Sts.', 'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Niue', 'North Korea', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Palestine', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'Rwanda', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Korea', 'South Sudan', 'Spain', 'Sri Lanka', 'St. Kitts and Nevis', 'St. Lucia', 'St. 
Vincent and the Grenadines', 'Sudan', 'Suriname', 'Sweden', 'Switzerland', 'Syria', 'Tajikistan', 'Tanzania', 'Thailand', 'Timor-Leste', 'Togo', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Vatican', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe']\n" ] } ], "source": [ "data = pd.read_csv(\"../data/data_regression/dataset_interventions.csv\",\n", " encoding=\"utf-8-sig\")\n", "\n", "D = 209\n", "N = len(data)\n", "print(N)\n", "dataset = np.zeros((N, D), dtype=np.float64)\n", "\n", "dataset[:,:11] = (data.loc[:,\"nb_delegates\":\"experience score\"]).to_numpy()\n", "\n", "labelset = np.zeros((N,), dtype=np.float64)\n", "labelset[:] = (data.loc[:, \"nb_interventions\"]).to_numpy()\n", "\n", "# sanity checks: no missing labels, and the number of samples with 0 interventions\n", "print(len(data.loc[data[\"nb_interventions\"].notna()]))\n", "print(len(data.loc[data[\"nb_interventions\"] == 0]))\n", "\n", "# read the valid countries\n", "country_file = open(\"../data/dictionaries/valid_countries.txt\", \"r\")\n", "countries = country_file.readlines()\n", "country_file.close()\n", "countries = [c.replace(\"\\n\", \"\") for c in countries]\n", "countries = sorted(countries)\n", "print(countries)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The remaining part of the dataset encodes the country. We need a function that returns the index of a country in the sorted list of valid countries." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def get_country_index(country):\n", " if country in countries:\n", " return countries.index(country)\n", " else:\n", " # unknown country\n", " return len(countries)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To finalize the dataset, we use the defined function. For the country columns (index 11 onwards), a 1 means that the affiliation is this country, a 0 means that it isn't." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "for i in range(N):\n", " country = data.iloc[i,1]\n", " dataset[i, FIRST_COUNTRY_INDEX + get_country_index(country)] = 1\n", "\n", "# TODO maybe verify that all rates are correct" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Split the data into training data and test data\n", "In a first step, we consider the first 80% of the samples as training data and the remaining 20% as test data." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# shuffle everything\n", + "np.random.seed(2020)\n", "p = np.random.permutation(N)\n", "shuffled_dataset = dataset[p]\n", "shuffled_labelset = labelset[p]\n", "\n", "SPLIT_IDX = 7400\n", "# separate train and test data\n", "X_train = shuffled_dataset[:SPLIT_IDX]\n", "Y_train = shuffled_labelset[:SPLIT_IDX]\n", "\n", "X_test = shuffled_dataset[SPLIT_IDX:]\n", "Y_test = shuffled_labelset[SPLIT_IDX:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Baseline models\n", "To put the performance of our models into perspective, we introduce two baseline models. The first one always predicts 0 interventions."
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Mean squared error: 69.25\n" + "Mean squared error: 91.79\n" ] } ], "source": [ "n = Y_test.shape\n", "test_predict_baseline_zero = np.zeros(n)\n", "baseline_zero_mse = mean_squared_error(Y_test, test_predict_baseline_zero)\n", "print('Mean squared error: %.2f'\n", " % baseline_zero_mse)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another baseline is to predict, for each country, the average number of interventions it made in the training data meetings." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Mean squared error: 21.48\n" + "Mean squared error: 29.33\n" ] } ], "source": [ "test_predict_baseline_avg = np.zeros(n)\n", "\n", "# fill in a list with the averages\n", "intervention_averages = []\n", " \n", "# predict accordingly\n", "for i in range(len(countries)):\n", " index = FIRST_COUNTRY_INDEX + i\n", " train_samples_this_country = (Y_train[X_train[:,index] == 1])\n", " # guard against countries that never occur in the training data\n", " avg = train_samples_this_country.mean() if len(train_samples_this_country) > 0 else 0.0\n", " intervention_averages.append(avg)\n", " \n", " test_predict_baseline_avg[X_test[:,index] == 1] = avg\n", " \n", "baseline_avg_mse = mean_squared_error(Y_test, test_predict_baseline_avg)\n", "print('Mean squared error: %.2f'\n", " % baseline_avg_mse)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As expected, this result is equal to the one of a simple linear model without global bias that only uses the country features." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Mean squared error: 21.48\n" + "Mean squared error: 29.33\n" ] } ], "source": [ "# baseline data\n", "X_train_baseline = X_train[:,FIRST_COUNTRY_INDEX:]\n", "X_test_baseline = X_test[:,FIRST_COUNTRY_INDEX:]\n", "\n", "reg = linear_model.LinearRegression(fit_intercept=False)\n", "reg.fit(X_train_baseline, Y_train)\n", "test_predict = reg.predict(X_test_baseline)\n", "intervention_mse = mean_squared_error(Y_test, test_predict)\n", "print('Mean squared error: %.2f'\n", " % intervention_mse)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Ridge regression on all the data without experience\n", "Now we can train the first actual model: conventional ridge regression, which assumes Gaussian-distributed targets. We first use cross-validation to determine the best regularizer." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The best regularizer is lambda = 0.5890837159618431\n" - ] - } - ], + "outputs": [], "source": [ "# cross-validation to determine the regularizer\n", "reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 10000))\n", "reg.fit(X_train, Y_train)\n", "lambda_ = reg.alpha_\n", "print(f\"The best regularizer is lambda = {lambda_}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally, we train the model with the optimal lambda."
] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.3385376474101482\n", - "[ 6.85240909e-03 -3.20908916e-01 7.19978068e-01 2.68566882e-01\n", - " -3.20810152e-01 -1.33914568e+00 -4.50412452e-01 1.12182311e+00\n", - " -5.10403131e-02 -5.71016667e-01 0.00000000e+00 -1.54640712e+00\n", - " -1.58373086e+00 -8.44184014e-01 -1.62938161e+00 -1.77281249e+00\n", - " -1.35089637e+00 2.96002098e+00 -1.72109953e+00 1.53857720e+01\n", - " -1.49745941e+00 -1.79927782e+00 -1.47492860e+00 -1.57728694e+00\n", - " -4.07399039e-01 -1.06586231e+00 -6.35340818e-01 -1.78078640e+00\n", - " -1.54400951e+00 -1.29802655e+00 -1.47731161e+00 3.65013164e+00\n", - " -1.50788785e+00 -1.01995743e+00 7.71383983e+00 -1.71188272e+00\n", - " -1.15777947e+00 -1.17883178e+00 -1.61533889e+00 -1.87183358e+00\n", - " -1.68619114e+00 -1.71191925e+00 1.54602173e+01 -1.58281342e+00\n", - " -1.75435129e+00 -4.73363904e-01 5.07426029e+01 1.63701839e+00\n", - " -1.95646111e+00 -1.96737187e+00 -1.21790139e+00 -5.82975927e-01\n", - " -2.08753550e+00 -1.23790569e+00 -1.12780401e+00 -1.83938139e+00\n", - " -1.69029345e+00 -2.04845668e+00 -1.63763332e+00 -1.88243077e+00\n", - " -1.58765374e+00 -1.67678413e+00 -8.43039778e-01 6.31110777e-01\n", - " -1.29233058e+00 -1.79932379e+00 -1.83662221e+00 -1.58551045e+00\n", - " -1.57128569e+00 -1.39447746e+00 -1.60248904e+00 -1.38667570e+00\n", - " -8.86680530e-01 -1.71427250e+00 -4.48110528e-01 -1.65630464e+00\n", - " -1.10218361e+00 -6.00103375e-01 -1.64688643e+00 -5.10108753e-01\n", - " -1.18733894e+00 -1.93202297e+00 -1.97728188e+00 -1.27133052e+00\n", - " -1.71841149e+00 -1.63112390e+00 -1.30119875e+00 -6.69116873e-01\n", - " 6.82097567e+00 2.84553797e-01 4.61503190e-01 -1.44883003e+00\n", - " -1.55178399e+00 -1.54793647e+00 -1.37559820e+00 -1.03083680e+00\n", - " 1.66143331e+01 -1.65492431e+00 -8.31736034e-01 -5.77688136e-01\n", - " -1.38542616e+00 1.22786971e+00 -1.58983478e+00 -1.83924807e+00\n", - " -1.41684587e+00 -1.46160206e+00 -1.79683391e+00 -1.84732341e+00\n", - " -1.72063582e+00 -1.96970250e+00 -1.65789524e+00 -1.73748844e+00\n", - " -1.56001557e+00 -1.80981567e+00 -1.42185420e+00 1.23719538e+00\n", - " -1.42331189e+00 -1.46773545e+00 -1.57958994e+00 5.04132609e-01\n", - " -1.55206972e+00 -1.57585103e+00 7.16594390e-01 -4.00032178e-01\n", - " -1.81469724e+00 -1.95477724e+00 -1.57959081e+00 -1.63515492e+00\n", - " -1.16235326e+00 -1.55235401e+00 -1.74356489e+00 -1.41233591e+00\n", - " -1.70317058e+00 -1.30534285e+00 -9.99458276e-01 7.90167585e+00\n", - " -8.20464306e-01 -2.03173850e+00 -3.31250326e-01 -1.59555413e+00\n", - " -1.93307824e+00 8.08588592e+00 -1.26734493e+00 -2.79415608e-01\n", - " -1.73260787e+00 -1.62520780e+00 -1.17008551e+00 4.92869072e-01\n", - " -1.41733667e+00 -2.76602855e-01 3.03666241e+00 -7.82965235e-01\n", - " -1.56442269e+00 -1.23626608e+00 -1.41902003e+00 4.64456077e+00\n", - " -1.84042516e+00 9.43701875e-01 -2.05905057e+00 -1.90021262e+00\n", - " 1.28704579e+01 -8.19286674e-01 -1.66227107e+00 -1.81793214e+00\n", - " -1.65300334e+00 -2.37847419e-01 -1.56401073e+00 -9.20795306e-01\n", - " -1.65453139e+00 -1.70933752e+00 3.81884269e+00 6.92833870e-01\n", - " -1.68889980e+00 -9.21905573e-01 -1.47816932e+00 -1.57132651e+00\n", - " -1.12971325e+00 -1.63127956e+00 -1.21101926e+00 -1.47358363e+00\n", - " -1.33980538e+00 9.02580075e+00 -1.34701882e+00 -1.86973033e+00\n", - " -1.54370596e-03 -4.99064453e-01 -1.50687737e+00 -1.55122770e+00\n", 
- " -1.75855521e+00 -1.10754833e+00 -1.84102042e+00 -9.31742302e-01\n", - " -1.83078678e+00 3.90929855e+00 -8.48693465e-02 -9.08614069e-01\n", - " -1.11752161e+00 -2.08499814e+00 5.37254593e+01 -3.35837699e-01\n", - " -1.39059764e+00 -1.91156035e+00 -1.56136347e+00 2.16808631e+00\n", - " -1.64821370e+00 -2.02838055e+00 -1.17905601e+00 -6.07615370e-01\n", - " -2.01407971e+00]\n" - ] - } - ], + "outputs": [], "source": [ "reg = linear_model.Ridge(alpha=lambda_)\n", "reg.fit(X_train, Y_train)\n", "w0 = reg.intercept_\n", "W = reg.coef_\n", "print(w0)\n", "print(W)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[-0.41206791 0.56000727 0.17317352 ... 0.02251548 0.25230943\n", - " 19.3002127 ]\n", - "Mean squared error: 21.53\n" - ] - } - ], + "outputs": [], "source": [ "test_predict = reg.predict(X_test)\n", "print(test_predict)\n", "print('Mean squared error: %.2f'\n", " % mean_squared_error(Y_test, test_predict))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As we notice, the mean square error is basically equal to the one in the baseline model. The additional dimensions do not help to improve the model." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Linear Model with logarithmic transformation\n", "The reason for the failure of the normal linear model is that our Y doesn't follow a Gaussian distribution, but a logarithmic. We thus try the same model but with a transformed Y. We transform the y's accordingly: \n", "$y' = log(c + y)$ with $c > 0$.\n", "We try different values for c. First, c = 1 (which preserves y = 0 to be y' = 0)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "c = 0.05\n", - "Mean squared error : 15.27\n", - "c = 0.1\n", - "Mean squared error : 13.74\n", - "c = 0.15\n", - "Mean squared error : 13.25\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;31m# crossvalidation for lambda\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[0mreg\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlinear_model\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mRidgeCV\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0malphas\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlogspace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m6\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m6\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m1000\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[0mreg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mY_train_transf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[0mlambda_\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mreg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0malpha_\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mtest_predict_transf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mreg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Miniconda3\\lib\\site-packages\\sklearn\\linear_model\\_ridge.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 1600\u001b[0m \u001b[0mstore_cv_values\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstore_cv_values\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1601\u001b[0m is_clf=is_classifier(self))\n\u001b[1;32m-> 1602\u001b[1;33m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msample_weight\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1603\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0malpha_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0malpha_\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1604\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbest_score_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbest_score_\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Miniconda3\\lib\\site-packages\\sklearn\\linear_model\\_ridge.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 1491\u001b[0m \u001b[0msqrt_sw\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mones\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mn_samples\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1492\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1493\u001b[1;33m \u001b[0mX_mean\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0mdecomposition\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdecompose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msqrt_sw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1494\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1495\u001b[0m \u001b[0mscorer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_scoring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mscoring\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mallow_none\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Miniconda3\\lib\\site-packages\\sklearn\\linear_model\\_ridge.py\u001b[0m in \u001b[0;36m_svd_decompose_design_matrix\u001b[1;34m(self, X, y, sqrt_sw)\u001b[0m\n\u001b[0;32m 1410\u001b[0m \u001b[0mU\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0msingvals\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlinalg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msvd\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfull_matrices\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1411\u001b[0m \u001b[0msingvals_sq\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msingvals\u001b[0m \u001b[1;33m**\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1412\u001b[1;33m \u001b[0mUT_y\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mU\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mT\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1413\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mX_mean\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msingvals_sq\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mU\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mUT_y\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1414\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m<__array_function__ internals>\u001b[0m in \u001b[0;36mdot\u001b[1;34m(*args, **kwargs)\u001b[0m\n", - "\u001b[1;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], + "outputs": [], "source": [ "cs = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.5, 1]\n", "\n", "for c in cs:\n", " # logarithmic transformation\n", " Y_train_transf = np.log(c + Y_train)\n", " Y_test_transf = np.log(c + Y_test)\n", "\n", " # crossvalidation for lambda\n", " reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 1000))\n", " reg.fit(X_train, Y_train_transf)\n", " lambda_ = reg.alpha_\n", " test_predict_transf = reg.predict(X_test)\n", " # transform the output back\n", " test_predict = np.exp(test_predict_transf) - c\n", "\n", " print(f\"c = {c}\")\n", " print('Mean squared error : %.2f'\n", " % mean_squared_error(Y_test_transf, test_predict))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To compare, we do the baseline data (only countries) with the optimal c that we found" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean squared error : 11.85\n" ] } ], "source": [ "c = 0.2 # the optimal found above\n", "\n", "# logarithmic transformation\n", "Y_train_transf = np.log(c + Y_train)\n", "Y_test_transf = np.log(c + Y_test)\n", "\n", "# crossvalidation for lambda\n", "reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 1000))\n", "reg.fit(X_train_baseline, Y_train_transf)\n", "lambda_ = reg.alpha_\n", "test_predict_transf = reg.predict(X_test_baseline)\n", "# transform the output back\n", "test_predict = np.exp(test_predict_transf) - c\n", "\n", "print('Mean squared error : %.2f'\n", " % mean_squared_error(Y_test_transf, test_predict))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Combine multiple models\n", "Even with the logistic transformation, our data doesn't follow a Gaussian distribution. One problem is that there are a lot of samples with value 0. Several papers suggest for situations like that to first perform a classifying task that decides whether a sample is 0 or not, and then apply a second model. (e.g. 
https://www.kent.ac.uk/smsas/personal/msr/webfiles/zip/ibc_fin.pdf) As we work with count data, a Poisson distribution may fit our data better.\\\n", "Hence, we first apply logistic regression to classify samples into 0 and non-0." ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[False False False ... False False True]\n", "[False True False ... False False False]\n", "0.7749036873968079\n" ] } ], "source": [ "Y_train_class = Y_train > 0\n", "Y_test_class = Y_test > 0\n", "\n", "clf = linear_model.LogisticRegression(max_iter=1000)\n", "clf.fit(X_train, Y_train_class)\n", "predict_class = clf.predict(X_test)\n", "print(predict_class)\n", "print(Y_test_class)\n", "accuracy = accuracy_score(Y_test_class, predict_class)\n", "print(accuracy)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(Note: only 78% accuracy on the test data.)\\\n", "Now, on the data that is not zero, we apply a Poisson regressor." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean squared error on Poisson data : 3.11\n", "Mean squared error : 63.92\n" ] } ], "source": [ "# alternative: select the training samples by the classifier's prediction\n", "# X_train_poiss = X_train[clf.predict(X_train) == 1]\n", "# Y_train_poiss = Y_train[clf.predict(X_train) == 1]\n", "\n", "# train on the truly non-zero samples, test on the predicted non-zero ones\n", "X_train_poiss = X_train[Y_train_class == 1]\n", "Y_train_poiss = Y_train[Y_train_class == 1]\n", "X_test_poiss = X_test[clf.predict(X_test) == 1]\n", "Y_test_poiss = Y_test[clf.predict(X_test) == 1]\n", "\n", "# log transformation (c is still 0.2 from above)\n", "Y_train_poiss = np.log(c + Y_train_poiss)\n", "Y_test_poiss = np.log(c + Y_test_poiss)\n", "\n", "reg = linear_model.PoissonRegressor(alpha=0.1, max_iter=1000)\n", "reg.fit(X_train_poiss, Y_train_poiss)\n", "predict_poiss = reg.predict(X_test_poiss)\n", "\n", "# this MSE is computed on the log-transformed scale\n", "print('Mean squared error on Poisson data : %.2f'\n", " % mean_squared_error(Y_test_poiss, predict_poiss))\n", "\n", "# on everything: predicted zeros stay 0, the rest is transformed back\n", "predict = np.zeros(Y_test.shape)\n", "predict[clf.predict(X_test) == 1] = np.exp(predict_poiss) - c\n", "\n", "print('Mean squared error : %.2f'\n", " % mean_squared_error(Y_test, predict))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/report/data_processing.tex b/report/data_processing.tex index 802a0d5..90d00fd 100644 --- a/report/data_processing.tex +++ b/report/data_processing.tex @@ -1,384 +1,381 @@ \chapter{Data Extraction and Processing} The main part of this project is to extract the information contained in the participant lists of UNFCCC meetings. We explain in this chapter how we extracted and processed the data from the PDF lists. \section{Data Extraction} -This section describes task one of the project. 
+This section describes task one of the project, i.e., how we extract the data from the PDF participant lists. The first step consists of transforming the available PDF files into text files. It is important to keep some information about the structure of the text to be able to find the relevant data in the resulting text file. \\ The second step consists of transforming the text files to comma-separated values (CSV) files. The result of this task is a CSV file for each processed participant list that contains the entries \textit{affiliation category, affiliation, name, description}. \subsection{Raw Dataset} \label{dataset} We download the participant lists from the document webpage of the UNFCCC secretariat. \cite{UNFCCC_docs} -The analyzed lists contain all the COP meetings and almost all the SB meetings. Note that during a COP, there is always an SB meeting +The lists we process contain all the COP meetings and almost all the SB meetings. Note that during a COP, there is usually an SB meeting held in parallel for which there is no separate participant list. \\ A participant list has the following general structure: Participants are listed under the affiliation they belong to. A member of the Swiss government is for example listed under the affiliation “Switzerland”. A participant is given a salutation that contains at least "Mr." or "Ms.", but may also contain titles such as "H.E." (i.e. "Her Excellency"). Some participants, but not all of them, are given a description that explains their role within the delegation. This description could for example be "Minister of Foreign Affairs". Affiliations are sorted according to their affiliation category and then alphabetically. The possible affiliation categories are: \begin{itemize} \item Parties \item Observer States \item United Nations Secretariat units and bodies \item Specialized agencies and related organizations \item Intergovernmental organizations \item Non-governmental organizations \end{itemize} The category "Media" exists for newer participant lists, but the corresponding participants are not listed. We therefore exclude this category. \\ -The format of the participant lists varies over time. For the first meetings the participant lists are paper scans, what means that +The format of the participant lists varies over time. For the first meetings the participant lists are paper scans, which means that we need to convert images to text. Furthermore, the manner in which affiliations are indicated varies: in the first meetings they are always written in all uppercase letters, which was changed in later meetings. \\ We choose the version of the participant lists that is published during the last days of a meeting. We exclude the corrigenda, documents that are published later for some participant lists and contain corrections of the lists, because their format varies a lot and many of the listed corrections are rather small (change of order of participants within an affiliation, change of descriptions). \subsection{Optical Character Recognition} -To extract the data from the scanned lists, we use Optical Character Recognition (OCR), more precisely Python-tesseract (pytesseract). -\cite{pytesseract} Python-Tesseract is a wrapper for the OCR engine Tesseract that is developed by Google since 2006, open-source and -available under the Apache 2.0 license. \\ -% TODO first check what options I finally use, then describe how tesseract works. -Tesseract works as follows. As a first step, it performs a connected component analysis that finds blobs that are organized to text lines. -Then, a two-pass process for recognition is applied. +To extract the data from the scanned lists, we use Optical Character Recognition (OCR), more precisely Python-tesseract (pytesseract \cite{pytesseract}). +Python-Tesseract is a wrapper for the OCR engine Tesseract, which has been developed by Google since 2006. \\ +Tesseract works as follows. First, it performs a connected component analysis to find blobs that are organized as text lines. +Then, a two-pass process for recognition is applied. % TODO connected component analysis In the first pass, the program tries to recognize each word. If a word is recognized satisfactorily, it is used as training data for every word that follows. To make use of all this training data, the second pass goes over all words that were not recognized in the first pass. \cite{tesseract_expl} \\ The version of Tesseract that we use additionally introduces an LSTM-based neural network recognition engine. % TODO LSTM neural nets
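For orientation, a minimal sketch of how a scanned list can be run through pytesseract follows; the file names are hypothetical, and since the report does not state the exact Tesseract options we used, the `--psm 4` page-segmentation choice here is an assumption:

```python
# OCR sketch: render each PDF page to an image, then run Tesseract on it.
# File names are hypothetical; --psm 4 treats the page as a single column
# of text of variable sizes, which helps preserve reading order.
from pdf2image import convert_from_path
import pytesseract

pages = convert_from_path("cop1_participants.pdf", dpi=300)
with open("cop1_participants.txt", "w", encoding="utf-8") as out:
    for page in pages:
        out.write(pytesseract.image_to_string(page, config="--psm 4"))
        out.write("\n")
```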
In the dataset of this project, the Tesseract OCR engine fails for some specific pages that contain only sparsely distributed participants without descriptions. To improve the performance of the connected component analysis, we insert half-transparent boxes on pages -that encounter this problem. (See figure \ref{fig:boxes}) This ensures the correct order of names in the resulting text file. +that encounter this problem (see Figure \ref{fig:boxes}). This ensures the correct order of names in the resulting text file. \begin{figure}[ht] \caption{Page with an inserted half-transparent box before OCR} \centering -\includegraphics[width=0.3\textwidth]{boxes_tesseract.png} +\includegraphics[width=0.4\textwidth]{boxes_tesseract.png} \label{fig:boxes} -\end{figure} +\end{figure} % TODO bigger % TODO change title \subsection{Well-formatted PDF Extraction} To extract the data from the well-formatted PDF files, we use a PDF processing package called Pdfminer.six.\cite{pdfminer.six} -This python package is community-maintained and open-source. Again, the main difficulty is to extract the text of the list in the correct order; this is especially hard for documents with three columns. For this reason, we adapted the use of Pdfminer.six by rewriting one of its classes, the \texttt{PDFPageAggregator}. \\ First, we briefly explain how pdfminer extracts text from PDF files. Pdfminer.six performs a layout analysis on every page before extracting the text. This analysis is done in three stages: \begin{itemize} \item Group characters to words and lines \item Group lines to boxes \item Group textboxes hierarchically \end{itemize} -The output of the layout analysis is visualized in figure \ref{fig:pdfminer}.\\ +The output of the layout analysis is visualized in Figure \ref{fig:pdfminer}.\\ \begin{figure}[ht] \caption{Output of the layout analysis of pdfminer.six} \centering \includegraphics[width=0.9\textwidth]{pdfminer.png} \label{fig:pdfminer} \end{figure} The class we want to modify, \texttt{PDFPageAggregator}, is responsible for outputting the text lines of a page in the determined order. To be able to sort the text lines according to our rules later, we modify the function \texttt{receive\_layout} such that it outputs -for each LTTextLine the available x and y positions within the page. In our script that performs the extraction, we then define rules to +for each LTTextLine the available $x$ and $y$ positions within the page. In our script that performs the extraction, we then define rules to determine in which column a text line is situated. \\
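The column rules themselves are not spelled out in this report, so the following sketch only illustrates the idea: the column boundaries (200, 400) are made up, and the `(x0, y0, text)` tuples stand in for whatever our modified `receive_layout` actually emits per LTTextLine.

```python
# Hypothetical sketch of the column rules: assign each text line to a
# column by its x position, then read column by column, top to bottom.
def column_of(x0, boundaries=(200, 400)):
    for col, bound in enumerate(boundaries):
        if x0 < bound:
            return col
    return len(boundaries)

def reading_order(lines):
    # lines: iterable of (x0, y0, text) tuples; PDF y coordinates grow
    # upwards, so sort by -y0 to go top to bottom within a column
    return sorted(lines, key=lambda line: (column_of(line[0]), -line[1]))
```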
A special case for the page layout is affiliation category titles: they break the column system in the middle of a page. We therefore need to recognize them by their content and introduce special rules for pages that contain them.\\ Another difficulty is the recognition of new affiliations. Pdfminer.six is not able to get information about the font style, so the only way to detect new affiliations is through line breaks and the fact that names always start with a salutation. As line breaks are automatically preserved with pdfminer, we encounter problems only in special situations: when a new affiliation is at the top of a column and longer than two lines, we can't distinguish it from the description of a previous participant that is split across two columns. -\subsection{Extraction from text files} +\subsection{Extraction from Text Files} We now need to extract the information from the generated text files. We do this with the following procedure (a condensed sketch follows the list): \begin{enumerate} \item Clean the text file from unnecessary elements, e.g. page numbers and page headers. \item Iterate through the rows of the text file and repeatedly apply: \begin{enumerate} \item Check if the current line is the beginning of a new affiliation category. We do this by keyword checking. \item Check if the current line is a new affiliation. We look for format cues like a row in uppercase letters (for early meetings) or lines that come after a double line break and don't start with a salutation. \item Check if the current line is a new name for the current affiliation by detecting a salutation. \item If none of the above is the case, add the line to the description of the current participant. \end{enumerate} \end{enumerate} Note that this algorithm fails for participants whose entry does not start with a salutation. But as this case occurs only a few times in all the processed lists, we can neglect this error.
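A condensed sketch of this classification loop, with abbreviated salutation and category lists and a hypothetical input structure (the real script also strips page headers and numbers first):

```python
# Sketch of the line-classification loop; lists are abbreviated and the
# (text, after_blank) input format is an assumption for illustration.
SALUTATIONS = ("Mr.", "Ms.", "Sr.", "Sra.", "H.E. Mr.", "H.E. Ms.")
CATEGORIES = ("Parties", "Observer States", "Non-governmental organizations")

def parse(lines):
    """lines: (text, after_blank) pairs; after_blank marks a double line break."""
    rows, category, affiliation = [], None, None
    for text, after_blank in lines:
        if text in CATEGORIES:                 # (a) new affiliation category
            category = text
        elif text.startswith(SALUTATIONS):     # (c) new participant name
            rows.append([category, affiliation, text, ""])
        elif after_blank or text.isupper():    # (b) new affiliation
            affiliation = text
        elif rows:                             # (d) description continuation
            rows[-1][3] = (rows[-1][3] + " " + text).strip()
    return rows  # (category, affiliation, name, description) entries
```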
\section{Data Processing} -This section describes task two of the project. +This section describes task two of the project, i.e., how we gain more information from the extracted data. % TODO rewrite The goal is to post-process the CSV files and bring all the meetings together into one dataset that contains more attributes per participant. -\subsection{Unification of meetings} -% country names: translation and country converter +\subsection{Unification of Meetings} In order to make our complete dataset as consistent as possible, we need the same affiliation to be named the same -throughout all the meetings. \\ +throughout all the meetings. For some earlier meetings, e.g. COP 2, the English version of the participant list was not available. We therefore processed the French versions of their participant lists. With the help of a dictionary, we translate -the names of all the parties to English. \\ +the names of all the parties to English. Once all the country names are in English, we still need to unify them to a single denotation per country. For example, the party Venezuela is named "Venezuela" in the participant list of COP 6, but "Venezuela (Bolivarian Republic of)" in COP 25. To unify the English country names, we use the Python package country-converter. \cite{coco} We use it to change every country name to its "short" name; "Venezuela (Bolivarian Republic of)" then becomes "Venezuela". This package has some limits when misspellings occur. For example, an error in the OCR process caused Iran to be spelled "Tran" in COP 1, so the country-converter does not recognize it correctly. \\ Note that we applied translation and unification only to the parties. Even if in some earlier meetings there are other affiliations that are written in different ways, the unification would have been more difficult and more error-prone due to the larger number of possible names. We therefore decided not to apply unification to the rest of the participant lists, including the descriptions.
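A minimal usage sketch of country-converter for this unification step; the example inputs mirror the cases described above:

```python
# Unify country names to their "short" form with country-converter.
import country_converter as coco

names = ["Venezuela (Bolivarian Republic of)", "Venezuela", "Tran"]
print(coco.convert(names=names, to="name_short"))
# ['Venezuela', 'Venezuela', 'not found']
# Unmatched names come back as 'not found' by default, which is what
# happens with the OCR error "Tran" mentioned above.
```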
-\subsection{Gender and title} +\subsection{Gender and Title} The easiest additional attributes to extract are the gender and title of participants. This is due to the very static structure of names in the UNFCCC participant lists: each name starts with a salutation ("Mr.", "Ms.", "Sr.", "Sra." etc.) that is associated with either male or female. By simply checking this salutation, we can extract the gender -of each participant. \\ +of each participant. Optionally, the salutation contains a title like "H.E." ("Her Excellency"), "Dr." or "Prof.". We set a binary attribute \textit{has\_title} to 1 if a participant is listed with such a title, and to 0 otherwise. \subsection{Roles} The descriptions of the participants contain more information about them, but in a very inconsistent format: every affiliation can decide what to provide as descriptions of their participants. -We try to extract information out of the descriptions by defining roles. These roles should try to define the +We extract information out of the descriptions by defining roles. These roles capture the +function of a participant within its affiliation. \\ We assign a role to a participant by looking for keywords in its description. The following list contains the -roles that we look for and some corresponding keywords decreasing priority. If a description contains +roles that we look for and some corresponding keywords, in order of decreasing priority. If a description contains keywords from more than one role, it is assigned the one with the higher priority (see the sketch after this list). \begin{itemize} \item Security (Security Officer, Security Service) \item Diplomacy (Ambassador, Embassy, Diplomatic) \item Government (Ministry, Minister, Government, Parliament, Agency, Department of, European Commission, Presidential Office) \item Press (Journalist, Reporter, Radio, Press) \item Universities (Professor, Researcher, Student, University) \end{itemize} The reason for "Security" having the highest priority is that security service is often provided to people of other roles. With our priority rule, the description "Security Officer of the Minister" is assigned the role "Security" and not "Government", which is the correct choice. On the other hand, we avoid the keyword "Security" for this role to prevent a description like "Minister for Politics, Law, and Security Affairs" from being assigned the role "Security". \\
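A sketch of the priority-ordered keyword matching, with the keyword lists abbreviated to those given above:

```python
# Priority-ordered role assignment: the first role whose keywords match
# wins, so "Security" beats "Government" for mixed descriptions.
ROLE_KEYWORDS = [  # highest priority first; lists abbreviated
    ("Security",     ["Security Officer", "Security Service"]),
    ("Diplomacy",    ["Ambassador", "Embassy", "Diplomatic"]),
    ("Government",   ["Ministry", "Minister", "Government", "Parliament"]),
    ("Press",        ["Journalist", "Reporter", "Radio", "Press"]),
    ("Universities", ["Professor", "Researcher", "Student", "University"]),
]

def assign_role(description):
    for role, keywords in ROLE_KEYWORDS:
        if any(kw.lower() in description.lower() for kw in keywords):
            return role
    return "no keyword found"

assign_role("Security Officer of the Minister")  # -> "Security"
```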
-\subsection{Association to fossil fuel industry} +\subsection{Association to Fossil Fuel Industry} We also use keywords to determine whether a participant is associated with the fossil fuel industry or not. We separate this from the roles because we check not only the description, but also the affiliation name for the keywords. For example, we want to detect all participants of the NGO -"canadian association of petroleum producers" as associated to the fossil fuel industry, even if +"Canadian Association of Petroleum Producers" as associated with the fossil fuel industry, even if they don't have any description. \\ It is also an advantage that a participant that is associated with the fossil fuel industry can still have a role. For example, Saudi Arabia has a Ministry of Petroleum. The corresponding Minister is assigned the role "Government" but is still associated with the fossil fuel industry. \subsection{Experience} When bringing together the data of all the different meetings, we are interested in the experience of participants. We define experience as the number of earlier UNFCCC meetings that the participant has visited. We differentiate between experience in SB meetings and COP meetings, as they have quite different characteristics. Furthermore, we differentiate between experience within a delegation of a Party to the Convention (i.e. category "Parties") and experience in a delegation of a non-governmental organization. \\ To determine the experience, we have to compare names throughout different meetings. There are some situations where a plain text comparison would fail, even if it's the same person: \begin{itemize} \item Different spellings of the name, simplification of a special character (e instead of é) \item Long names that span over more than one line are not entirely detected in the newer PDFs because there are three columns in the document. Hence, only a part of the name is detected. % TODO example \item The order of names is swapped (e.g. "Obama Barack" instead of "Barack Obama") \end{itemize} We decided to handle these cases in the following manner: \begin{itemize} - \item Allow an Edit distance of 1. (explanation below) + \item Allow an edit distance of 1 (see below). \item Consider two names as the same when one starts with the other ("Alexander Van der Bellen" and "Alexander Van der" are considered to be the same person). We exclude names with - a low number of characters (\textless 15) from this rule to guarantee that a line break is involved. + fewer than 15 characters from this rule to guarantee that a line break is involved. \item If the sets of words of two names are equal, the persons are considered to be the same. \end{itemize} -We decide to compute the \textbf{Edit distance} between names. There exist several types of Edit distances. -All of them share the fact that they count the minimum number of operations to get from one string to the other. +We compute the \textbf{edit distance} between names. There exist several types of edit distances. +All of them count the minimum number of operations to get from one string to the other. We need to keep the accepted distance very small to keep the error rate low. With over 130 000 distinct participants, the occurrence of very similar names is probable. To get the property that we want, we need substitution to be allowed, such that a missed special character -or a typo can simply be replaced by the correct character. We compare the performance of two Edit distances. \\ +or a typo can simply be replaced by the correct character. We compare the performance of two edit distances. \\ The \textbf{Hamming distance} only allows substitution, hence the compared strings need to have the same length. It is equal to the number of positions at which the symbols differ in the two strings. The \textbf{Levenshtein distance} allows substitution, insertion and deletion. It is equal to the minimum number of single-character edits required to change one string into the other. Mathematically, \begin{equation} \label{levenshtein} lev(a,b) = \begin{cases} \lvert a \rvert & \text{if } \lvert b \rvert = 0 \\ \lvert b \rvert & \text{if } \lvert a \rvert = 0 \\ lev(tail(a), tail(b)) & \text{if } a[0] = b[0] \\ 1 + \min \begin{cases} lev(tail(a), b) \\ lev(a, tail(b)) \\ lev(tail(a), tail(b)) \\ \end{cases} & \text{otherwise} \end{cases} \end{equation} where for a string $x$, $tail(x)$ is the string without the first character and $\lvert x \rvert$ is the length of the string. \\
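For reference, a compact iterative version of the equation above; evaluating the recursive definition naively would be exponential, so this is a standard dynamic-programming sketch, not necessarily our actual implementation:

```python
# Levenshtein distance via dynamic programming, row by row.
def levenshtein(a, b):
    prev = list(range(len(b) + 1))  # distances from "" to prefixes of b
    for i, ca in enumerate(a, start=1):
        cur = [i]
        for j, cb in enumerate(b, start=1):
            cur.append(min(prev[j] + 1,                  # deletion
                           cur[j - 1] + 1,               # insertion
                           prev[j - 1] + (ca != cb)))    # substitution
        prev = cur
    return prev[-1]

levenshtein("yong chul cho", "yongchul cho")  # -> 1, within our threshold
```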
-When comparing the results of Levenshtein distance and Hamming distance on our data, the links that were additionally -found by the Levenshtein distance were mostly correct ones. One common case is for example a forgotten apostrophe (e.g. +When comparing the results of the Levenshtein distance and the Hamming distance on our data, the pairs that were additionally +identified as the same person by the Levenshtein distance were mostly correct ones. One common case is a forgotten apostrophe (e.g. "yaara peretz" and "ya'ara peretz") or a missing empty space (e.g. "yong chul cho" and "yongchul cho"). Some false positives are introduced, but this is rather due to common names (e.g. "yan jia" and "yuan jia"). Based on the results of this comparison, we choose the Levenshtein distance in the final implementation. \\ To mark false positives, we add another attribute to the dataset that is set to one if the name of a participant has been detected twice in one of the earlier meetings. When this flag is set, the experience attributes contain an error. \\ In addition to the attributes for experience that we can add to our dataset, we obtain for each participant the information in which meetings they have participated within which affiliation. \\ Note that a delegation is one instance of an affiliation: each affiliation comes to a new meeting with a new delegation. To be able to compare delegations with respect to the experience of their participants, we need to define a metric for the experience of a delegation. We call this the \textbf{experience score} of an affiliation and define it as follows: \begin{equation} ExperienceScore(\text{delegation}) = avg(\text{total experience of the top 10 most experienced participants}) \end{equation} The reason for only choosing the top 10 is that delegations are sometimes very big, with only a few participants actively involved in the negotiation process. \section{Results} We process the participant lists of 54 UNFCCC meetings, 26 COPs and 28 SBs. We find in total 271,434 entries. \\ On average, we find 8353 participants per COP meeting and 1949 participants per SB meeting. -The following figures \ref{fig:cop_overall} and \ref{fig:sb_overall} show the total numbers of extracted participants +We show in Figures \ref{fig:cop_overall} and \ref{fig:sb_overall} the total numbers of extracted participants of all the COP and SB meetings respectively. +% TODO comment figures! % TODO maybe add number of distinct affiliations (easy). Maybe add plots for some affiliations, some top 10 \begin{figure} \centering \begin{minipage}[ht]{.5\textwidth} \captionsetup{width=.8\linewidth} \captionof{figure}{Overview of the extracted participants of COP meetings} \centering \includegraphics[width=0.6\textwidth]{participants_per_cop.png} \label{fig:cop_overall} \end{minipage}% \begin{minipage}[ht]{.5\textwidth} \captionsetup{width=.8\linewidth} \captionof{figure}{Overview of the extracted participants of SB meetings} \centering \includegraphics[width=0.6\textwidth]{participants_per_sb.png} \label{fig:sb_overall} \end{minipage} \end{figure} \subsection{Gender and Title} The proportion of women has steadily increased since the first meetings. Starting at a rate of 21.4\% at the first SB meeting in 1995, it reached its temporary peak of 47.3\% at SB 50 in 2019. Figure \ref{fig:gender} shows the continuously increasing trend of this measure, with a slightly higher rate of women at the SB meetings compared to the COP meetings. \begin{figure}[ht] \caption{Proportion of female participants per meeting} \centering \includegraphics[width=0.8\textwidth]{gender.png} \label{fig:gender} \end{figure} The UNFCCC secretariat publishes gender composition reports, as their goal is to reach gender balance at their meetings, which may lead to more gender-sensitive climate policies. They show in these reports that even if the numbers almost reach 50\% in the latest meetings, equality is not yet reached: the proportion of women is lower when only looking at the Parties to the Convention, and it is also significantly lower when considering the heads of delegations. \cite{UNFCCC_genderreport} \\ The number of participants with a title is generally rather low. For COP meetings, the average rate of participants with a title is 3.9\%; for SB meetings the average is 1.8\%. \subsection{Roles} The assigned roles are mainly of interest for parties, as the descriptions are the most exhaustive for their delegates and -also contain more keywords. The figures \ref{fig:cop_roles} and \ref{fig:sb_roles} show which roles have been found to +also contain more keywords. Figures \ref{fig:cop_roles} and \ref{fig:sb_roles} show which roles have been found to which percentage in the parties of the meetings for COP and SB meetings respectively. The main role is "Government". The role "no keyword found" in the plots shows the participants that did not match any keyword; the role "no description" contains the participants that didn't have a description in the source document. \begin{figure} \centering \begin{minipage}{.5\textwidth} \captionof{figure}{Assigned roles for COP meetings} \centering \includegraphics[width=1\linewidth]{roles_cop.png} \label{fig:cop_roles} \end{minipage}% \begin{minipage}{.5\textwidth} \captionof{figure}{Assigned roles for SB meetings} \centering \includegraphics[width=1\linewidth]{roles_sb.png} \label{fig:sb_roles} \end{minipage} \end{figure} \subsection{Association to Fossil Fuel Industry} The number of detected participants that openly represent the fossil fuel industry varies a lot from meeting to meeting. Figures \ref{fig:cop_fossil} and \ref{fig:sb_fossil} show the absolute numbers of detected fossil fuel industry representatives for COP and SB meetings respectively. The average rate of participants with a fossil fuel industry association is 1.7\% for COP meetings and 2.7\% for SB meetings. These rates have decreased over the years as the number of participants has increased. 
\begin{figure} \centering \begin{minipage}[ht]{.5\textwidth} \captionsetup{width=.8\linewidth} \captionof{figure}{Participants with fossil fuel industry association (COP)} \centering \includegraphics[width=1\linewidth]{ff_cop.png} \label{fig:cop_fossil} \end{minipage}% \begin{minipage}[ht]{.5\textwidth} \captionsetup{width=.8\linewidth} \captionof{figure}{Participants with fossil fuel industry association (SB)} \centering \includegraphics[width=1\linewidth]{ff_sb.png} \label{fig:sb_fossil} \end{minipage} \end{figure} \subsection{Experience} Over all meetings, we find 138,940 distinct participants. \\ We identify 193 persons that have participated in at least half of the 54 processed meetings. The most experienced participants and their affiliations in COP 25 are the following: \begin{enumerate} - \item Helmut Hojesky: Austria (26 COP, 27 SB) + \item Helmut Hojesky: Austria (26 COP, 27 SB) % TODO victor add more information? \item Norine Kennedy: United States Council for International Business (25 COP, 28 SB) \item Manfred Treber: Germanwatch (26 COP, 26 SB) \end{enumerate} % TODO flow of participants TODO "Movements of participants" \\ % TODO experience score graphs (include austria!!) We use our Experience Score to compare affiliations according to their experience. Figure \ref{fig:expscore_overview} shows the average Experience Score over all affiliations per meeting. The separation of the bars shows whether the experience was gained rather in COP or in SB meetings. \begin{figure}[ht] \caption{Average Experience Score over time} \centering \includegraphics[width=1\textwidth]{experiencescore_overview.png} \label{fig:expscore_overview} \end{figure} TODO plot experience score for some affiliations (include Austria) \ No newline at end of file diff --git a/report/introduction.tex b/report/introduction.tex index 10cd59e..6e428dd 100644 --- a/report/introduction.tex +++ b/report/introduction.tex @@ -1,50 +1,49 @@ \chapter{Introduction} -This is the very first sentence in my report. Isn't that great? \section{International Climate Negotiations} For decades, there has been scientific consensus on anthropogenic climate change, as well as on the urge to introduce more political measures to fight its principal cause, greenhouse gas emissions. In short, the Intergovernmental Panel on Climate Change (IPCC) states in its 2018 special report that human activities have already caused a global warming of approximately 1 degree Celsius. To reduce the risks to natural and human systems, the warming should optimally be limited to 1.5 degrees Celsius, which would still need rapid and far-reaching transitions in many human-controlled systems. \cite{ipcc:2018} \\ The United Nations Framework Convention on Climate Change (UNFCCC) was opened for signature in 1992 at the UN Conference on Environment and Development in Rio de Janeiro. It entered into force in 1994 and is today signed by 196 countries and the European Union. The ultimate objective of the Convention is "the stabilization of greenhouse gas concentrations in the atmosphere at a level that would prevent dangerous anthropogenic interference with the climate system". \cite{UNFCCC} This goal is rather vague, but is to be understood as a political reaction to the first IPCC assessment report that was published in 1990. The first major agreement that resulted from the Convention was the Kyoto Protocol in 1997, which commits industrialized countries to measure and limit their greenhouse gas emissions according to individual targets. 
The second major agreement, the Paris Agreement, was reached in 2015. It is a legally binding treaty that aims to limit global warming to well below 2, preferably to 1.5 degrees Celsius. \cite{UNFCCC_process} \cite{evolution_UNFCCC} The UNFCCC establishes different institutional arrangements for the negotiation process. The most impactful ones are the governing Supreme Bodies, to which the Conference of the Parties (COP) belongs. At COP meetings, the Parties to the Convention meet to make decisions about the implementation of the Convention and other adopted legal instruments. Furthermore, there are Subsidiary Bodies (SB) that assist the governing bodies in their decision-making process. The Subsidiary Body for Scientific and Technological Advice (SBSTA) provides the latest research results on scientific and technological matters. The Subsidiary Body for Implementation (SBI) assists the governing bodies in questions related to the implementation of the Convention and the agreements. SB meetings are held -two times a year, once at the same time as the COPs. \cite{UNFCCC_process} -As there is less public attention at SB meetings, the actual negotiations are more important. \\ +twice a year, once at the same time as the COPs. \cite{UNFCCC_process} +As SB meetings attract less public attention, more of the actual negotiation work happens there. \\ Other bodies of the UNFCCC exist, e.g. the process management bodies and the secretariat, but they are not relevant for our project. \section{Project} \subsection{Larger Project} \label{tatiana} % TODO how to cite Tatianas work This semester project is part of a bigger project that aims to study country delegation characteristics and patterns of international cooperation. -We collaborate with the political scientists Marlene Kammerer (university of Bern) and Paula Castro (ZHAW). \\ +We collaborate with the political scientists Marlene Kammerer (University of Bern) and Paula Castro (ZHAW). \\ In 2020, Victor Kristof and Tatiana Cogne processed data of the Earth Negotiation Bulletin. Their dataset includes detailed summaries of international climate negotiation meetings organized by the UNFCCC. -They collect interventions and extract the interactions of countries and coalitions. +They collected interventions and extracted the interactions of countries and coalitions. +% TODO put more \subsection{Our project} For each negotiation meeting, the UNFCCC secretariat publishes a list of participants. These lists are PDF files and therefore -not easy to treat. Considering the high number of participants per meeting, we want to extract the information +not easy to process. Considering the quantity of participants per meeting, we want to extract the information contained in the original participant lists and bring it into a more convenient format. More information about the dataset is provided in section \ref{dataset}. Afterwards, we would like to process this information to get as much data as possible. -The tasks could therefore be stated as follows: +The problems we try to solve can therefore be stated as follows: \begin{enumerate} - \item Extraction of the data in the participant lists (PDF files) and conversion to a convenient format. - \item Processing the whole dataset to get more insight. + \item Extract the data of the participant lists (PDF files) and convert it to a convenient format. + \item Process the whole dataset to get more insight. % TODO is a bit too vague + \item Create a predictive model for the existing intervention data (see section \ref{tatiana}). 
-As a third task, we try to create a predictive model for the existing intervention data (see section \ref{tatiana}). -Due to time constraints, we weren't yet able to put as much effort as needed into this task. -The progress of this third task is explained in chapter \ref{predictive_modelling}. \ No newline at end of file +Due to time constraints, we were not yet able to put as much effort as needed into the last problem; its current state is described in chapter \ref{predictive_modelling}. \ No newline at end of file