Page MenuHomec4science

extract_descriptions.py
No OneTemporary

File Metadata

Created
Fri, Jun 28, 21:34

extract_descriptions.py

import pandas as pd
import numpy as np
import re
from collections import Counter
from partlistproc.COP_Analyzer import COP_Analyzer
# Collect every non-empty description line of every participant, count how
# often each line occurs, and append the 200 most common ones to a text file.
descriptions = []
participants = pd.read_csv("../results/complete_dataset.csv", encoding="utf-8-sig")
for index, description in participants["description"].items():
    # Progress indicator: print the row label every 1000 rows.
    if index % 1000 == 0:
        print(index)
    # Empty CSV cells are read as NaN; skip them so they are not counted as
    # the literal description line "nan".
    if pd.isna(description):
        continue
    description_list = re.split(COP_Analyzer.description_splitter, str(description))
    # filter(None, ...) drops empty pieces produced by the split.
    descriptions.extend(filter(None, description_list))

print("Find the most common lines:")
counter = Counter(descriptions)
# len(counter) is the number of distinct lines; sum(counter.values()) would be
# the total number of lines including repetitions.
print("Totally found " + str(len(counter)) + " distinct lines, the 20 most common being")
print(counter.most_common(20))

# Save the 200 most common lines.
most_common_lines = counter.most_common(200)
# NOTE(review): the file is opened in append mode, so repeated runs accumulate
# results in the same file -- confirm this is intended rather than "w".
# The encoding is explicit because descriptions may contain non-ASCII text.
with open("most_common_descriptions.txt", "a", encoding="utf-8") as output_file:
    for line, count in most_common_lines:
        output_file.write(str(count) + " times the line: " + line + "\n")

Event Timeline