Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F72792453
UppercaseAffiliationMeetingAnalyzer.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, Jul 17, 02:25
Size
4 KB
Mime Type
text/x-python
Expires
Fri, Jul 19, 02:25 (2 d)
Engine
blob
Format
Raw Data
Handle
19103705
Attached To
R10013 cop-mining-participants
UppercaseAffiliationMeetingAnalyzer.py
View Options
import
pandas
as
pd
import
re
from
partlistproc.MeetingAnalyzer
import
MeetingAnalyzer
class UppercaseAffiliationMeetingAnalyzer(MeetingAnalyzer):
    """Analyzer for meetings whose participant lists mark each new
    affiliation with a line written in capital letters.

    The methods below read ``self.uppercase_abbrev``,
    ``self.affiliation_categories``, ``self.salutory_addresses`` and
    ``self.description_splitter``. None of them is set in this class; they
    are presumably provided by ``MeetingAnalyzer`` (verify against the base
    class). The latter two categories/addresses are passed to
    ``str.startswith`` and therefore have to be a ``str`` or a tuple of
    ``str``.
    """

    def __init__(self, intermediate_name, encoding=None):
        """
        Args:
            intermediate_name (str): name of the txt file that contains text
                of participant list
            encoding (str, optional): how the intermediate file is encoded.
                Defaults to None (which can be used for results of tesseract).
        """
        self.intermediate_name = intermediate_name
        self.encoding = encoding

    def get_data(self, output_name):
        """ Overriding abstract method

        Reads the intermediate text file, extracts one row per participant
        and writes the rows as a CSV file (encoded as "utf-8-sig", without
        index column).

        Args:
            output_name (str): path of the CSV file to generate
        """
        print("Analyze the .txt file to generate the participant data with ")

        content_list = self._read_content_list()
        rows = self._parse_participants(content_list)

        # Build the frame once from all rows: DataFrame.append was removed in
        # pandas 2.0 and copied the whole frame on every call. An explicit
        # list (instead of a set) keeps the column order deterministic.
        data = pd.DataFrame(
            rows,
            columns=[
                "name",
                "affiliation",
                "affiliation_category",
                "description",
            ],
        )

        # generate the output file
        data.to_csv(output_name, encoding="utf-8-sig", index=False)
        MeetingAnalyzer.print_small_analysis(self, data)

    def _read_content_list(self):
        """Return the cleaned lines of the intermediate text file.

        Returns:
            list of str: the lines of the file without the page-number lines
        """
        # the context manager closes the file even if reading fails
        with open(self.intermediate_name, "r", encoding=self.encoding) as file:
            entire_text = file.read()

        # NOTE(review): the first search string was an invisible character in
        # the original source; it is assumed to be a form feed (page break).
        # An empty search string would put a newline between all characters.
        # Confirm against the repository version of this file.
        entire_text = (
            entire_text.replace("\f", "\n")
            .replace(",", ".")
            .replace("Continued", "")
        )

        # split it to a list
        content_list = entire_text.split("\n")

        # delete all the page numbers (lines that start and end with '-')
        return [
            el
            for el in content_list
            if not (el.startswith("-") and el.endswith("-"))
        ]

    def _parse_participants(self, content_list):
        """Turn the cleaned lines into one dict per participant.

        A line counts as affiliation (or affiliation category) header if it
        is uppercase, is not contained in ``self.uppercase_abbrev`` and its
        first three characters are alphabetic. A line that starts with one of
        ``self.salutory_addresses`` starts a new person. Every other
        non-empty line is added to the description of the current person.

        Args:
            content_list (list of str): lines of the participant list

        Returns:
            list of dict: rows with the keys "name", "affiliation",
            "affiliation_category" and "description"
        """
        rows = []

        # state of the participant / affiliation that is currently read
        name = ""
        description = ""
        affiliation = ""
        affiliation_cat = "parties"

        # An index loop is needed because multi-line headers consume the
        # following lines as well.
        i = 0
        list_size = len(content_list)
        while i < list_size:
            elem = content_list[i]

            # check if it's a new affiliation
            if (elem.isupper()
                    and elem not in self.uppercase_abbrev
                    and elem[:3].isalpha()):
                # store the last person (if there is one)
                if name:
                    rows.append({
                        "name": name,
                        "affiliation": affiliation,
                        "affiliation_category": affiliation_cat,
                        "description": description,
                    })
                    name = ""
                    description = ""

                # check if it's new affiliation or category
                if elem.lower().startswith(self.affiliation_categories):
                    affiliation_cat = elem.lower()
                    # the category may continue on the following uppercase
                    # lines (an uppercase line is never empty)
                    while (i + 1 < list_size
                           and content_list[i + 1].isupper()):
                        affiliation_cat += " " + content_list[i + 1].lower()
                        i += 1
                else:
                    # set the new affiliation
                    affiliation = elem.lower()
                    # the affiliation may continue on the following lines;
                    # empty lines in between are skipped
                    while i + 1 < list_size and (
                            content_list[i + 1].isupper()
                            or not content_list[i + 1]):
                        if content_list[i + 1]:
                            affiliation += " " + content_list[i + 1].lower()
                        i += 1

            elif elem.startswith(self.salutory_addresses):
                # a new person: store the last person first
                if name:
                    rows.append({
                        "name": name,
                        "affiliation": affiliation,
                        "affiliation_category": affiliation_cat,
                        "description": description,
                    })
                    description = ""
                # If no person was open, description lines collected so far
                # are kept and end up in the description of this person.
                # set the new one
                name = elem

            elif elem:
                # add it to the actual persons description
                description += elem + self.description_splitter

            i += 1

        # The list does not end with a header, so the person that is still
        # open has to be stored here; otherwise the last participant of the
        # file would be missing in the output.
        if name:
            rows.append({
                "name": name,
                "affiliation": affiliation,
                "affiliation_category": affiliation_cat,
                "description": description,
            })

        return rows
Event Timeline
Log In to Comment