Page MenuHomec4science

find_experience.py
No OneTemporary

File Metadata

Created
Wed, Jul 10, 13:06

find_experience.py

import sys
import pandas as pd
import editdistance
import json
# constants
# Largest Levenshtein distance, and largest difference in length, at which
# compare_names still treats two names as the same person.
max_distance = 1
# Minimum length both names must have before compare_names applies its
# "one name is a prefix of the other" rule (part of a name wrapped onto the
# next line).
min_length_for_linebreak = 15
# Registry of all unique names seen so far, filled in by get_experience:
# key name -> list[(meeting, name, affiliation, affiliation_category)]
names = dict()
def compare_names(name1, name2):
    """Decide whether two names most likely refer to the same person.

    Three heuristics are tried in order; the first one that applies wins:

    1. Truncated name: both names are at least ``min_length_for_linebreak``
       characters long, one is a string prefix of the other, and the words of
       one form a subset of the words of the other (some words ended up on
       the next line).
    2. Swapped name parts: both strings have the same length and are made of
       the same set of words (first name and last name inverted).
    3. Typo: the lengths differ by at most ``max_distance`` and the
       Levenshtein distance between the names is at most ``max_distance``.

    Args:
        name1 (str): First name to compare.
        name2 (str): Second name to compare.

    Returns:
        bool: True if any heuristic considers the names equal, else False.
    """
    len1, len2 = len(name1), len(name2)

    # Heuristic 1: one name is the beginning of the other.
    both_long = (len1 >= min_length_for_linebreak
                 and len2 >= min_length_for_linebreak)
    if both_long and (name2.startswith(name1) or name1.startswith(name2)):
        words1, words2 = set(name1.split()), set(name2.split())
        if words1 <= words2 or words2 <= words1:
            return True

    # Heuristic 2: same words in a different order.
    if len1 == len2 and set(name1.split()) == set(name2.split()):
        return True

    # Heuristic 3: Levenshtein distance, only worth computing when the
    # lengths are close enough for a distance <= max_distance to be possible.
    if abs(len1 - len2) > max_distance:
        return False
    return editdistance.eval(name1, name2) <= max_distance
def get_experience(name, meeting, affiliation, affiliation_category):
    """Return a delegate's prior experience and record this participation.

    The global ``names`` registry is scanned in insertion order for the first
    key that ``compare_names`` considers equal to ``name``. If one is found,
    the experience counters are computed from the participations stored under
    that key *before* the current participation is appended to it. Otherwise
    ``name`` becomes a new key holding only the current participation.

    Args:
        name (str): Name of the delegate as listed for this meeting.
        meeting (str): Label of the meeting; labels starting with "cop" and
            "sb" are counted separately.
        affiliation: Affiliation of the delegate; stored in the registry
            unchanged.
        affiliation_category: Category of the affiliation; the value
            "parties" is counted as party experience, anything else as
            non-party experience.

    Returns:
        tuple[int, int, int, int, int]: ``(cop_exp, sb_exp, party_exp,
        not_party_exp, exp_err_poss)``. The first four are counts of earlier
        participations; ``exp_err_poss`` is 1 when, including the current
        participation, some meeting occurs more than once for this person
        (which hints at two different people being merged), else 0.
    """
    participation = (meeting, name, affiliation, affiliation_category)

    for known_name in names:
        if compare_names(name, known_name):
            break
    else:
        # Nobody matching this name has been seen before.
        names[name] = [participation]
        return 0, 0, 0, 0, 0

    history = names[known_name]
    cop_exp = sum(1 for past in history if past[0].startswith("cop"))
    sb_exp = sum(1 for past in history if past[0].startswith("sb"))
    party_exp = sum(1 for past in history if past[3] == "parties")
    not_party_exp = sum(1 for past in history if past[3] != "parties")

    # Record the current participation only after counting, so that it does
    # not contribute to its own experience.
    history.append(participation)

    # The same person cannot attend one meeting twice: a repeated meeting
    # label (current participation included) signals a possible false match.
    meetings_attended = {past[0] for past in history}
    exp_err_poss = len(meetings_attended) != len(history)

    return cop_exp, sb_exp, party_exp, not_party_exp, int(exp_err_poss)
if __name__ == "__main__":
    # Script entry point: computes the experience of every delegate, meeting
    # by meeting in the order given by the metadata file, and writes the
    # enriched dataset plus the name registry to disk.

    # Columns receiving the five values returned by get_experience, in order.
    experience_columns = [
        "experience cop",
        "experience sb",
        "experience party",
        "experience not_party",
        "experience possible error",
    ]
    # A list rather than a set, so the column order of the output file is
    # deterministic.
    output_columns = [
        "meeting",
        "name",
        "gender",
        "has_title",
        "affiliation",
        "affiliation_category",
        "role",
        "description",
    ] + experience_columns

    complete_data = pd.read_csv("../results/complete_dataset.csv",
                                encoding="utf-8-sig")
    metadata = pd.read_csv("../data/meetings_metadata.csv")

    frames = []
    for label in metadata["label"]:
        print(label)
        # Copy the selection so that adding columns does not write into a
        # view of complete_data (SettingWithCopyWarning).
        data = complete_data.loc[complete_data.meeting == label].copy()

        # get_experience mutates the global registry, so it must be called
        # exactly once per row, in row order. An explicit loop guarantees
        # that; DataFrame.apply does not on every pandas version (older
        # releases may evaluate the first row twice) and it breaks on a
        # meeting without any rows.
        experience = [
            get_experience(name, meeting, affiliation, category)
            for name, meeting, affiliation, category in zip(
                data["name"],
                data["meeting"],
                data["affiliation"],
                data["affiliation_category"],
            )
        ]
        data[experience_columns] = pd.DataFrame(
            experience, columns=experience_columns, index=data.index)
        frames.append(data)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    if frames:
        complete_data_with_experience = pd.concat(frames, ignore_index=True)
    else:
        complete_data_with_experience = pd.DataFrame(columns=output_columns)
    # Known columns first (created empty if missing from the input), followed
    # by any additional columns of the input dataset.
    extra_columns = [column
                     for column in complete_data_with_experience.columns
                     if column not in output_columns]
    complete_data_with_experience = complete_data_with_experience.reindex(
        columns=output_columns + extra_columns)

    # generate the output file
    complete_data_with_experience.to_csv(
        "../results/complete_dataset_experience-def.csv",
        encoding="utf-8-sig", index=False)
    print(len(names))

    # print the dictionary to a text file; the context manager closes the
    # file even if serialization fails
    with open("experience_dict_def.txt", "w", encoding="utf-8") as f:
        f.write(json.dumps(names))
def get_experience_score(delegates_experience):
    """Return the experience score of an affiliation.

    The score is the average experience of the ten most experienced
    delegates. When there are ten delegates or fewer, all of them are
    averaged. The input list is not modified.

    Args:
        delegates_experience (list[int]): The experiences of all the
            delegates of a party.

    Returns:
        The result of ``average`` applied to the selected experiences.
    """
    top_count = 10
    if len(delegates_experience) > top_count:
        ranked = sorted(delegates_experience, reverse=True)
        return average(ranked[:top_count])
    return average(delegates_experience)
def average(numbers):
    """Return the arithmetic mean of ``numbers``.

    Args:
        numbers (list[int]): A non-empty, sized collection of numbers.

    Returns:
        float: The sum of the values divided by their count (true division).

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    # The built-in sum() replaces a hand-written loop whose accumulator was
    # named ``sum`` and therefore shadowed the built-in.
    return sum(numbers) / len(numbers)

Event Timeline