Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F71243559
find_experience.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, Jul 10, 13:06
Size
4 KB
Mime Type
text/x-python
Expires
Fri, Jul 12, 13:06 (1 d, 21 h)
Engine
blob
Format
Raw Data
Handle
18926165
Attached To
R10013 cop-mining-participants
find_experience.py
View Options
import
sys
import
pandas
as
pd
import
editdistance
import
json
# Tuning constants for the fuzzy name matching done in compare_names().
# Largest edit distance (and largest length difference) still treated as a match.
max_distance = 1
# Both names must be at least this long before the "one name is a prefix of
# the other" rule (names broken across lines) is applied.
min_length_for_linebreak = 15

# Registry of every unique name seen so far. Maps the name under which a
# person was first registered to the list of their participations, each a
# tuple (meeting, name, affiliation, affiliation_category).
names = {}
def compare_names(name1, name2):
    """Decide whether two name strings should be treated as the same person.

    Three rules are tried in order; the first one that applies decides:

    1. Both names have at least ``min_length_for_linebreak`` characters, one
       is a prefix of the other, and the whitespace-separated words of one
       are a subset of the words of the other (covers names where some words
       ended up on the next line).
    2. Both names have the same length and the same set of words (covers
       first name and last name being swapped).
    3. The lengths differ by at most ``max_distance`` and the edit distance
       returned by ``editdistance.eval`` is at most ``max_distance``.

    Args:
        name1: First name string.
        name2: Second name string.

    Returns:
        bool: True if the names match under any rule, False otherwise.
    """
    len1, len2 = len(name1), len(name2)

    # Rule 1: truncated name (remaining words were on the next line).
    if len1 >= min_length_for_linebreak and len2 >= min_length_for_linebreak:
        if name2.startswith(name1) or name1.startswith(name2):
            words1 = set(name1.split())
            words2 = set(name2.split())
            if words1 <= words2 or words2 <= words1:
                return True

    # Rule 2: same words in a different order.
    if len1 == len2 and set(name2.split()) == set(name1.split()):
        return True

    # Rule 3: edit distance, only worth computing when the lengths are close
    # enough for a distance <= max_distance to be possible at all.
    if abs(len1 - len2) > max_distance:
        return False
    return editdistance.eval(name1, name2) <= max_distance
def get_experience(name, meeting, affiliation, affiliation_category):
    """Return a participant's prior experience and record this participation.

    The module-level ``names`` registry is scanned in insertion order for the
    first key that ``compare_names`` considers equal to ``name``. If one is
    found, the experience counts are computed from the participations stored
    so far, and only then is the current participation appended to that
    entry. If none is found, ``name`` is registered as a new key with the
    current participation as its only entry.

    Args:
        name: Name of the participant as it appears for this meeting.
        meeting: Label of the meeting; labels starting with "cop" / "sb" are
            counted as COP / SB participations.
        affiliation: Affiliation of the participant at this meeting.
        affiliation_category: Category of the affiliation; the value
            "parties" counts as party experience.

    Returns:
        tuple[int, int, int, int, int]: ``(cop_exp, sb_exp, party_exp,
        not_party_exp, exp_err_poss)``. The first four count earlier
        participations (the current one excluded). ``exp_err_poss`` is 1 when,
        after recording the current participation, the same meeting label
        occurs more than once for this person, else 0. All zeros for a
        previously unseen name.
    """
    current = (meeting, name, affiliation, affiliation_category)

    for known_name, history in names.items():
        if not compare_names(name, known_name):
            continue

        # Count experience before the current participation is recorded.
        cop_count = sum(1 for entry in history if entry[0].startswith("cop"))
        sb_count = sum(1 for entry in history if entry[0].startswith("sb"))
        party_count = sum(1 for entry in history if entry[3] == "parties")
        non_party_count = sum(1 for entry in history if entry[3] != "parties")

        history.append(current)

        # A meeting listed more than once for the same person hints at two
        # different people having been merged (or a duplicated row).
        distinct_meetings = {entry[0] for entry in history}
        duplicate_meeting = len(distinct_meetings) != len(history)

        return (
            cop_count,
            sb_count,
            party_count,
            non_party_count,
            int(duplicate_meeting),
        )

    # First time this name is seen: register it, no experience yet.
    names[name] = [current]
    return 0, 0, 0, 0, 0
if __name__ == "__main__":
    # Script entry point: read the complete participant dataset, compute the
    # experience columns meeting by meeting (in the order given by the
    # meetings metadata file), and write the enriched dataset plus the name
    # registry to disk.

    # Columns produced by get_experience(), in the order of its return tuple.
    experience_columns = [
        "experience cop",
        "experience sb",
        "experience party",
        "experience not_party",
        "experience possible error",
    ]
    # A list rather than a set: a set gives an arbitrary column order and is
    # rejected as `columns=` by pandas >= 2.0.
    output_columns = [
        "meeting",
        "name",
        "gender",
        "has_title",
        "affiliation",
        "affiliation_category",
        "role",
        "description",
    ] + experience_columns

    complete_data = pd.read_csv(
        "../results/complete_dataset.csv", encoding="utf-8-sig"
    )
    metadata = pd.read_csv("../data/meetings_metadata.csv")

    per_meeting_frames = []
    for label in metadata["label"]:
        print(label)
        # .copy() so the new columns are added to an independent frame and
        # not to a slice of complete_data (avoids SettingWithCopyWarning).
        data = complete_data.loc[complete_data.meeting == label].copy()
        if data.empty:
            # No participants for this meeting: nothing to compute or append.
            continue

        # Rows are processed in order because get_experience() updates the
        # shared `names` registry as a side effect.
        experience = [
            get_experience(row_name, row_meeting, row_affiliation, row_category)
            for row_name, row_meeting, row_affiliation, row_category in zip(
                data["name"],
                data["meeting"],
                data["affiliation"],
                data["affiliation_category"],
            )
        ]
        for column, values in zip(experience_columns, zip(*experience)):
            data[column] = list(values)
        per_meeting_frames.append(data)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    if per_meeting_frames:
        combined = pd.concat(per_meeting_frames, ignore_index=True)
        # Declared columns first, then any additional input columns.
        extra_columns = [c for c in combined.columns if c not in output_columns]
        complete_data_with_experience = combined.reindex(
            columns=output_columns + extra_columns
        )
    else:
        complete_data_with_experience = pd.DataFrame(columns=output_columns)

    # generate the output file
    complete_data_with_experience.to_csv(
        "../results/complete_dataset_experience-def.csv",
        encoding="utf-8-sig",
        index=False,
    )
    print(len(names))

    # Dump the name registry to a text file; the context manager closes the
    # file even if serialization fails.
    with open("experience_dict_def.txt", "w", encoding="utf-8") as f:
        f.write(json.dumps(names))
def get_experience_score(delegates_experience):
    """Compute the experience score of an affiliation.

    The score is the average experience of the (up to) ten most experienced
    delegates. With ten delegates or fewer, all of them are averaged.

    Args:
        delegates_experience (list[int]): The experiences of all the
            delegates of a party. The list itself is not modified.

    Returns:
        The value returned by ``average`` for the selected experiences.
    """
    selected = delegates_experience
    if len(delegates_experience) > 10:
        # Keep only the ten largest values; sorted() works on a new list.
        selected = sorted(delegates_experience, reverse=True)[:10]
    return average(selected)
def average(numbers):
    """Return the arithmetic mean of ``numbers``.

    Args:
        numbers: A sized collection of numbers (``len`` is called on it).

    Returns:
        The sum of the elements divided by their count.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    # The built-in sum() replaces a manual accumulator that shadowed it.
    return sum(numbers) / len(numbers)
Event Timeline
Log In to Comment