diff --git a/code/scripts/find_most_common_word_nokeyword.py b/code/scripts/find_most_common_word_nokeyword.py
new file mode 100644
index 0000000..3a0f3aa
--- /dev/null
+++ b/code/scripts/find_most_common_word_nokeyword.py
@@ -0,0 +1,31 @@
+import pandas as pd
+
+complete_data = pd.read_csv("../results/complete_dataset.csv",
+                            encoding="utf-8-sig")
+
+no_keyword_participants = complete_data.loc[complete_data["role"] == "no keyword found"]
+no_keyword_participants = no_keyword_participants.loc[no_keyword_participants["affiliation_category"] == "parties"]
+
+words_dict = dict()
+line_dict = dict()
+
+for description in no_keyword_participants["description"]:
+    description = description.replace(";", " ")
+    # description
+    if description in line_dict:
+        line_dict[description] += 1
+    else:
+        line_dict[description] = 1
+    # words
+    for word in description.split(" "):
+        if word in words_dict:
+            words_dict[word] += 1
+        else:
+            words_dict[word] = 1
+
+sorted_word_dict = sorted(words_dict.items(), key=lambda x: x[1], reverse=True)
+sorted_line_dict = sorted(line_dict.items(), key=lambda x: x[1], reverse=True)
+print("Most common words:")
+print(sorted_word_dict[:100])
+print("Most common lines:")
+print(sorted_line_dict[:100])
\ No newline at end of file
diff --git a/code/scripts/generate_complete_dataset.py b/code/scripts/generate_complete_dataset.py
index 3b6fa3a..b9c677e 100644
--- a/code/scripts/generate_complete_dataset.py
+++ b/code/scripts/generate_complete_dataset.py
@@ -1,203 +1,207 @@
 import pandas as pd
 import country_converter as coco
 import editdistance
 import os
 import re
 
 import partlistproc.MeetingAnalyzer as Ana
 import partlistproc.MeetingAnalyzerFactory as AnaFac
 
 """ This script generates one csv file containing all the participants of all the available meetings
     (information taken from metadata file)
 """
 
 
 def is_male(name):
     return any(title in name for title in Ana.MeetingAnalyzer.masculine_salutory_addresses)
 
 
 def is_female(name):
     return any(title in name for title in Ana.MeetingAnalyzer.feminine_salutory_addresses)
 
 
 def has_title(name):
     return any(title in name for title in Ana.MeetingAnalyzer.titles)
 
 
 def has_no_title(name):
     return not has_title(name)
 
 
 def get_role(description):
     # EDIT: I had to redesign this to make keywords of several words possible (28.11.20)
     description = str(description)
     if description == "nan":
         return "no description"
     splitted = re.split('[; ]{1}', description)
     for key_line in roles_dict.keys():
         keywords = re.split(" ", key_line)
         if(str(keywords[0]) in splitted or (str(keywords[0])).lower() in splitted):
             if(len(keywords) == 1):
                 return roles_dict[key_line]
             else:
                 # keyword is more than one word: check the others
                 found_word = str(keywords[0])
                 if found_word not in splitted:
                     found_word = found_word.lower()
                 index = splitted.index(found_word)
                 size = len(splitted)
                 for i in range(1, len(keywords)):
                     if index + i >= size or str(keywords[i]).lower() != str(splitted[index + i]).lower():
-                        return "no keyword found"
-                return roles_dict[key_line]
+                        # Mismatch (or description too short): stop checking
+                        # this keyword so the outer loop tries the next one.
+                        break
+                else:
+                    # Inner loop ended without break: every word matched.
+                    return roles_dict[key_line]
     return "no keyword found"
 
 
 def clear_name(name):
     """removes all salutory addresses and titles from a given name
 
     Args:
         name (str): the name to be cleared
     """
     cleared_name = name
     while cleared_name.startswith(Ana.MeetingAnalyzer.salutory_addresses):
         startindex = cleared_name.find(" ")
         if startindex == -1:
             return cleared_name
         startindex += 1
         cleared_name = cleared_name[startindex:]
     return cleared_name.lower()
 
 
 short_country_names = {}
 
 
 def simplify_country_name(affiliation):
     if affiliation in short_country_names:
         return short_country_names[affiliation]
     else:
         # None for not found makes that it returns the input value
         converted = (coco.convert(names=[affiliation], to="name_short", not_found=None))
         if isinstance(converted, list):
             converted = converted[0]
         short_country_names[affiliation] = converted
         return converted
 
 
 def is_fossil_fuel_associated(words):
     """checks if the given string contains a fossil fuel industry keyword
 
     Args:
         words (str): the string to be tested for keywords
     """
     splitted = re.split('[; ]{1}', (str(words)).lower())
     for keyword in fossil_fuel_keywords:
         if keyword in splitted:
             return True
     return False
 
 
 # pre-processing
 
 # extract the list of roles
 roles_dict = {}
 roles_file = open("../data/dictionaries/role_keywords.txt", "r", encoding="utf-8")
 role_lines = roles_file.readlines()
 current_role = ""
 for line in role_lines:
     if "\n" in line:
         line = line[:line.index("\n")]
     if line.startswith("["):
         if not line.endswith("]"):
             raise KeyError("Format on line {} was incorrect".format(line))
         current_role = line[1:len(line) - 1]
     else:
         if line != "":
             roles_dict[line] = current_role
 
 # extract the list of fossil fuel industry keywords
 fossil_fuel_keywords = []
 ff_file = open("../data/dictionaries/fossil_fuel_industry_keywords.txt", "r", encoding="utf-8")
 ff_lines = ff_file.readlines()
 for line in ff_lines:
     if "\n" in line:
         line = line[:line.index("\n")]
     if line != "":
         fossil_fuel_keywords.append(line.lower())
 
 # begin with the real processing
 
 complete_data = pd.DataFrame(columns={
     "meeting", "name", "gender", "has_title", "affiliation",
     "affiliation_category", "role", "fossil_fuel_industry",
     "description"})
 
 metadata = pd.read_csv("../data/meetings_metadata.csv")
 
 for label in metadata["label"]:
 
     datafile_name = "../results/participants-csv/participants_" + label + ".csv"
 
     if label in AnaFac.MeetingAnalyzerFactory.french_meetings:
         datafile_name = "../results/participants-csv/participants_" + label + "-en.csv"
         if not os.path.isfile(datafile_name):
             os.system("python extract_participants.py " + label)
             os.system("python translate_list_fr_en.py " + label)
 
     # generate the data if not yet available
     if not os.path.isfile(datafile_name):
         os.system("python extract_participants.py " + label)
 
     # open the data from this cop
     cop_data = pd.read_csv(datafile_name, encoding="utf-8-sig")
 
     # add its data to the complete dataframe
     cop_data["meeting"] = label
 
     # determine gender
     cop_data.loc[cop_data.name.apply(is_male), "gender"] = "m"
     cop_data.loc[cop_data.name.apply(is_female), "gender"] = "f"
 
     # determine title (if any)
     cop_data.loc[cop_data.name.apply(has_title), "has_title"] = 1
     cop_data.loc[cop_data.name.apply(has_no_title), "has_title"] = 0
 
     # define the role
     cop_data["role"] = cop_data["description"].apply(get_role)
 
     # define the association to fossil fuel industry
     cop_data["fossil_fuel_industry"] = 0
     cop_data.loc[cop_data.description.apply(is_fossil_fuel_associated), "fossil_fuel_industry"] = 1
     cop_data.loc[cop_data.affiliation.apply(is_fossil_fuel_associated), "fossil_fuel_industry"] = 1
 
     # clear up the name
     cop_data["name"] = cop_data["name"].apply(clear_name)
 
     # unify the country names
     cop_data.loc[cop_data.affiliation_category.apply(lambda p: p == "parties"), "affiliation"] = cop_data.loc[cop_data.affiliation_category.apply(lambda p: p == "parties"), "affiliation"].apply(simplify_country_name)
 
     print(label)
     print(cop_data[:5])
 
     complete_data = complete_data.append(cop_data, ignore_index=True)
 
 # only for a short time
 grouped_by_role = complete_data.groupby("role")
 for role, rest in grouped_by_role:
     print(f"{role}: {len(rest)} participants found")
 
 print(f"Country names map of length {len(short_country_names)}")
 print(short_country_names)
 
 short_country_names_cleaned = {k: v for (k, v) in short_country_names.items() if k != v}
 country_set = set(short_country_names_cleaned.values())
 print(f"Set of length {len(country_set)}")
 print(country_set)
 
 f = open("../data/dictionaries/valid_countries.txt", "w")
 for country in country_set:
     f.write(str(country) + "\n")
 f.close()
 
 # generate the output file
 complete_data.to_csv("../results/complete_dataset.csv", encoding="utf-8-sig", index=False)
diff --git a/code/scripts/generate_plots.py b/code/scripts/generate_plots.py
index a02e777..4d923cd 100644
--- a/code/scripts/generate_plots.py
+++ b/code/scripts/generate_plots.py
@@ -1,34 +1,34 @@
 import os
 
 import matplotlib.pyplot as plt
 import pandas as pd
 
 import plots.plot_fossil_fuel_industry as plot_fossil_fuel_industry
 import plots.plot_government as plot_government
 import plots.plot_experience as plot_experience
 import plots.plot_categories as plot_categories
 import plots.plot_missing_participants as plot_missing_participants
 import plots.plot_gender_rate as plot_gender_rate
 import plots.plot_delegation_sizes as plot_delegation_sizes
 import plots.plot_overall_experience_distr as plot_overall_experience_distr
 import plots.plot_delegation_exp as plot_delegation_exp
 import plots.plot_intervention_distr as plot_intervention_distr
 
-plot_intervention_distr.plot("../data/data_regression/dataset_interventions.csv")
 plot_government.plot("../results/complete_dataset.csv")
+plot_intervention_distr.plot("../data/data_regression/dataset_interventions.csv")
 
 plot_experience.plot("../results/complete_dataset_experience-2.csv")
 
 plot_fossil_fuel_industry.plot("../results/complete_dataset.csv")
 
 plot_missing_participants.plot("../results/participants-csv/participants_cop")
 
 plot_delegation_exp.plot("../results/complete_dataset_experience-2.csv")
 
 plot_overall_experience_distr.plot("../results/complete_dataset_experience-2.csv")
 
 plot_categories.plot("../results/participants-csv/participants_cop")
 
 plot_gender_rate.plot("../results/complete_dataset.csv")
 
 plot_delegation_sizes.plot("../results/complete_dataset.csv")
diff --git a/code/scripts/plots/plot_government.py b/code/scripts/plots/plot_government.py
index 3358cbc..723c334 100644
--- a/code/scripts/plots/plot_government.py
+++ b/code/scripts/plots/plot_government.py
@@ -1,90 +1,94 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 
 
 def get_roles(people):
     counter = len(people)
     gov = len(people.loc[people["role"] == "government"]) / counter
     dipl = len(people.loc[people["role"] == "diplomacy"]) / counter
     sec = len(people.loc[people["role"] == "security"]) / counter
     press = len(people.loc[people["role"] == "press"]) / counter
+    uni = len(people.loc[people["role"] == "universities"]) / counter
     nodescr = len(people.loc[people["role"] == "no description"]) / counter
     nokeyword = len(people.loc[people["role"] == "no keyword found"]) / counter
-    return gov, dipl, sec, press, nodescr, nokeyword
+    return gov, dipl, sec, press, uni, nodescr, nokeyword
 
 
 def plot(path):
     complete_data = pd.read_csv(path, encoding="utf-8-sig")
     parties = complete_data.loc[complete_data["affiliation_category"] == "parties"]
     metadata = pd.read_csv("../data/meetings_metadata.csv")
 
     result_df = pd.DataFrame(columns={"label", "government", "diplomacy",
-                                      "security", "press", "no description",
-                                      "no keyword found"})
+                                      "security", "press", "universities",
+                                      "no description", "no keyword found"})
 
     for label in metadata["label"]:
         this_meeting = parties.loc[parties["meeting"] == label]
         by_party = this_meeting.groupby("affiliation")
         gov_acc = 0
         dipl_acc = 0
         sec_acc = 0
         press_acc = 0
+        uni_acc = 0
         nodescr_acc = 0
         nokeyword_acc = 0
         for aff, people in by_party:
-            gov, dipl, sec, press, nodescr, nokeyword = get_roles(people)
+            gov, dipl, sec, press, uni, nodescr, nokeyword = get_roles(people)
             gov_acc += gov
             dipl_acc += dipl
             sec_acc += sec
             press_acc += press
+            uni_acc += uni
             nodescr_acc += nodescr
             nokeyword_acc += nokeyword
         n_parties = len(by_party)
         result_df = result_df.append({
             "label": label,
             "government": gov_acc / n_parties,
             "diplomacy": dipl_acc / n_parties,
             "security": sec_acc / n_parties,
             "press": press_acc / n_parties,
+            "universities": uni_acc / n_parties,
             "no description": nodescr_acc / n_parties,
             "no keyword found": nokeyword_acc / n_parties
         }, ignore_index=True)
 
     # First, plot for COPs
-    ordered_categories = ["government", "diplomacy", "security", "press", "no keyword found", "no description"]
+    ordered_categories = ["government", "diplomacy", "security", "press", "universities", "no keyword found", "no description"]
     colors = {"no description": "C5", "government": "C1", "diplomacy": "C2",
-              "security": "C6", "press": "C4", "no keyword found": "C0"}
+              "security": "C6", "press": "C4", "universities": "C8", "no keyword found": "C0"}
     plot_data = result_df.loc[result_df.label.apply(lambda l: l.startswith("cop"))]
     plot_data = plot_data.set_index("label")
     plot_data = pd.DataFrame(plot_data)
     plot_data.columns = pd.CategoricalIndex(plot_data.columns.values,
                                             ordered=True,
                                             categories=ordered_categories)
     # Sort the columns (axis=1) by the new categorical ordering
     plot_data = plot_data.sort_index(axis=1)
     plot_data.plot(kind="area",
                    xlabel="Meeting",
                    ylabel="Avg rate of roles per party",
                    title="Roles in parties (COP)",
                    stacked=True,
                    color=colors,
                    ylim=[0,1])
 
     # Second, plot for SBs
     plot_data2 = result_df.loc[result_df.label.apply(lambda l: l.startswith("sb"))]
     plot_data2 = plot_data2.set_index("label")
     #plot_data2 = plot_data2.unstack(level=0)
     plot_data2 = pd.DataFrame(plot_data2)
     plot_data2.columns = pd.CategoricalIndex(plot_data2.columns.values,
                                              ordered=True,
                                              categories=ordered_categories)
     plot_data2 = plot_data2.sort_index(axis=1)
     plot_data2.plot(kind="area",
                     xlabel="Meeting",
                     ylabel="Avg rate of roles per party",
                     title="Roles in parties (SB)",
                     stacked=True,
                     color=colors,
                     ylim=[0,1])
 
     plt.show()