Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F68776996
extract_descriptions.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Jun 28, 21:34
Size
1 KB
Mime Type
text/x-python
Expires
Sun, Jun 30, 21:34 (2 d)
Engine
blob
Format
Raw Data
Handle
18567046
Attached To
R10013 cop-mining-participants
extract_descriptions.py
View Options
"""Report the most frequent description lines across all participants.

Reads ``../results/complete_dataset.csv``, splits each participant's
``description`` value with ``COP_Analyzer.description_splitter``, counts how
often every resulting non-empty line occurs, prints a short summary and
appends the 200 most common lines to ``most_common_descriptions.txt``.
"""
import re
from collections import Counter

import numpy as np
import pandas as pd

from partlistproc.COP_Analyzer import COP_Analyzer

descriptions = []
participants = pd.read_csv("../results/complete_dataset.csv", encoding="utf-8-sig")

# Only the "description" column is needed, so iterate over that Series
# directly instead of materialising a full row object per participant.
for index, raw_description in participants["description"].items():
    # Empty CSV cells are loaded as NaN; str(NaN) would yield the literal
    # text "nan" and be counted as if it were a real description line.
    if not pd.isna(raw_description):
        pieces = re.split(COP_Analyzer.description_splitter, str(raw_description))
        # Drop empty strings (and None from unmatched capture groups, if the
        # splitter pattern has any) so they are not counted as lines.
        descriptions.extend(piece for piece in pieces if piece)

    # Lightweight progress indicator for large datasets.
    if index % 1000 == 0:
        print(index)

print("Find the most common lines:")
counter = Counter(descriptions)
# len(counter) is the number of *distinct* lines; sum(counter.values())
# would be the total number of occurrences, contradicting the message.
print(f"Totally found {len(counter)} distinct lines, the 20 most common being")
print(counter.most_common(20))

# Save the 200 most common lines. Mode "a" is kept on purpose: results of
# successive runs are appended rather than overwritten.
most_common_lines = counter.most_common(200)
# The context manager closes the file even if a write fails; the explicit
# encoding keeps non-ASCII text writable regardless of the system locale.
with open("most_common_descriptions.txt", "a", encoding="utf-8") as output_file:
    output_file.writelines(
        f"{count} times the line: {line}\n" for line, count in most_common_lines
    )
Event Timeline
Log In to Comment