Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F86502110
COP1to5_Analyzer.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Oct 6, 21:23
Size
4 KB
Mime Type
text/x-python
Expires
Tue, Oct 8, 21:23 (1 d, 22 h)
Engine
blob
Format
Raw Data
Handle
21432651
Attached To
R10013 cop-mining-participants
COP1to5_Analyzer.py
View Options
import
pandas
as
pd
import
re
import
partlistproc.COP_Analyzer
as
Analyzer
# TODO: rename to "Uppercase_affiliation_Analyzer" (describes the uppercase-line detection this class relies on)
class COP1to5_Analyzer():
    """Extracts participant data from the OCR'd participant lists of COP 1-5.

    Reads the intermediate .txt file (one layout line per text line), walks it
    line by line, and classifies each line as an affiliation category, an
    affiliation, a person (line starting with a salutory address), or part of
    the current person's description. The result is written to a CSV file.
    """

    def __init__(self, intermediate_name, encoding=None):
        """
        Args:
            intermediate_name (str): path of the intermediate .txt file to analyze.
            encoding (str, optional): how the intermediate file is encoded.
                Defaults to None (the platform default); "utf-8" can be used
                for results of tesseract.
        """
        self.intermediate_name = intermediate_name
        self.encoding = encoding

    def get_data(self, output_name):
        """Gets the data from the specified source file and writes it as CSV.

        Args:
            output_name (str): name of the output file to be generated.
        """
        print("Analyze the .txt file to generate the participant data")
        # 'with' guarantees the handle is closed even if reading raises.
        with open(self.intermediate_name, "r", encoding=self.encoding) as file:
            entire_text = file.read()
        # NOTE(review): the page-break character below is assumed to be a form
        # feed ('\x0c', emitted by tesseract between pages) -- confirm. The
        # original source showed an empty string here, which would insert a
        # newline between every single character and break the parser.
        entire_text = (entire_text.replace('\x0c', '\n')
                                  .replace(',', '.')
                                  .replace('Continued', ''))
        # split it to a list of lines
        content_list = entire_text.split('\n')
        # delete all the page numbers (lines such as "-12-")
        content_list = [el for el in content_list
                        if not (el.startswith('-') and el.endswith('-'))]

        # state of the participant currently being accumulated
        name = ""
        description = ""
        affiliation = ""
        affiliation_cat = ""

        # Collected records; the DataFrame is built once at the end with an
        # explicit column order (a set literal here would make the CSV column
        # order unspecified, and DataFrame.append was removed in pandas 2.0).
        columns = ["name", "affiliation", "affiliation_category", "description"]
        rows = []

        def store_person():
            # Append the participant currently being accumulated, if any,
            # and reset the per-person state.
            nonlocal name, description
            if name != "":
                rows.append({"name": name,
                             "affiliation": affiliation,
                             "affiliation_category": affiliation_cat,
                             "description": description})
                name = ""
                description = ""

        # fill in the data row by row
        i = 0
        list_size = len(content_list)
        while i < list_size:
            elem = content_list[i]
            # An all-uppercase line that is not a known abbreviation starts a
            # new affiliation or a new affiliation category.
            if (elem.isupper()
                    and elem not in Analyzer.COP_Analyzer.uppercase_abbrev
                    and elem[:3].isalpha()):
                # store the last person (if there is one)
                store_person()
                # check if it's a new affiliation category or a new affiliation
                if elem.lower().startswith(Analyzer.COP_Analyzer.affiliation_categories):
                    affiliation_cat = elem.lower()
                    # the category may continue over several uppercase lines
                    while i + 1 < list_size and content_list[i + 1].isupper():
                        if content_list[i + 1]:
                            affiliation_cat += " " + content_list[i + 1].lower()
                        i += 1
                else:
                    # set the new affiliation
                    affiliation = elem.lower()
                    # the affiliation may continue over several uppercase
                    # (possibly empty) lines
                    while i + 1 < list_size and (content_list[i + 1].isupper()
                                                 or not content_list[i + 1]):
                        if content_list[i + 1]:
                            affiliation += " " + content_list[i + 1].lower()
                        i += 1
            elif elem.startswith(Analyzer.COP_Analyzer.salutory_addresses):
                # a new person: store the last one, then start accumulating
                store_person()
                name = elem
            elif elem != "":
                # add it to the actual person's description
                description += elem + Analyzer.COP_Analyzer.description_splitter
            i += 1

        # BUG FIX: the last participant of the file was previously never
        # appended; flush the remaining accumulated person.
        store_person()

        data = pd.DataFrame(rows, columns=columns)
        # generate the output file (utf-8-sig so Excel detects the encoding)
        data.to_csv(output_name, encoding="utf-8-sig")

        print("do some analysis ---------------------------------------------")
        for cat, people in data.groupby('affiliation_category'):
            print(cat + " " + str(len(people)))
        print("The number of detected participants is " + str(len(data.index)))
Event Timeline
Log In to Comment