Page MenuHomec4science

DigitalMeetingAnalyzer.py
No OneTemporary

File Metadata

Created
Sun, Sep 1, 20:22

DigitalMeetingAnalyzer.py

import pandas as pd
import re
from partlistproc.MeetingAnalyzer import MeetingAnalyzer
from partlistproc.PdfExtractor import PdfExtractor
class DigitalMeetingAnalyzer(MeetingAnalyzer):
    """Analyzer for meetings with a digitally generated participant list pdf.

    The participant list is read from an intermediate text file and parsed
    line by line into (name, affiliation, affiliation category, description)
    records.

    The parser relies on the attributes ``affiliation_categories``,
    ``salutory_addresses`` and ``description_splitter``, which are not defined
    in this class; they are presumably supplied by ``MeetingAnalyzer``
    (TODO confirm). The first two are passed to ``str.startswith`` and must
    therefore be a ``str`` or a tuple of ``str``.
    """

    # Fixed column order of the generated CSV file. A list (not a set) keeps
    # the order deterministic and is accepted by every pandas version.
    _COLUMNS = ["name", "affiliation", "affiliation_category", "description"]

    def __init__(self, intermediate_name):
        """
        Args:
            intermediate_name (str): name of the txt file that contains text
                of participant list
        """
        self.intermediate_name = intermediate_name

    def get_data(self, output_name):
        """Overriding abstract method.

        Parses the intermediate text file, writes the participants to a CSV
        file and prints a small analysis of the result.

        Args:
            output_name (str): path of the CSV file to write (overwritten if
                it already exists).
        """
        print("Analyze the .txt file to generate the participant data")

        lines = self._read_lines()
        rows = self._parse_participants(lines)

        # Build the frame once from all rows: DataFrame.append was removed in
        # pandas 2.0 and copied the whole frame on every call.
        data = pd.DataFrame(rows, columns=self._COLUMNS)

        # generate the output file
        data.to_csv(output_name, encoding="utf-8-sig", mode="w", index=False)
        MeetingAnalyzer.print_small_analysis(self, data)

    def _read_lines(self):
        """Return the cleaned lines of the intermediate text file.

        Returns:
            list[str]: the lines with "(continued)" markers removed, leading
            and trailing spaces stripped, and page-number / document-header
            lines dropped. Blank lines are kept because the parser uses them
            as separators.
        """
        # the context manager guarantees the file is closed again
        with open(self.intermediate_name, "r", encoding="utf-8") as file:
            entire_text = file.read()

        # remove the "(continued)" markers from affiliations
        entire_text = entire_text.replace(" (continued)", "")
        entire_text = entire_text.replace("(continued)", "")

        # ADDED FOR COP12: delete the spaces at the beginning and at the end
        # of each line (only the space character, as before)
        lines = [line.strip(" ") for line in entire_text.split("\n")]

        # remove page numbers and document header
        return [line for line in lines
                if not line.startswith(("Page ", "FCCC"))]

    @staticmethod
    def _skip_blank(lines, index):
        """Return the index of the first non-empty line at or after index."""
        while index < len(lines) and lines[index] == "":
            index += 1
        return index

    @staticmethod
    def _read_paragraph(lines, start):
        """Read the lower-cased, space-joined paragraph starting at start.

        The line at ``start`` is always consumed; the following lines are
        consumed up to (not including) the next empty line.

        Returns:
            tuple[str, int]: the paragraph text and the index of the first
            line after it.
        """
        end = start + 1
        while end < len(lines) and lines[end] != "":
            end += 1
        return " ".join(line.lower() for line in lines[start:end]), end

    def _parse_participants(self, lines):
        """Turn the cleaned lines into one record per participant.

        Args:
            lines (list[str]): output of ``_read_lines``.

        Returns:
            list[dict]: one dict per participant with the keys listed in
            ``_COLUMNS``.
        """
        size = len(lines)
        rows = []
        affiliation_cat = ""

        # go to first country (bounded, so an empty file yields no rows
        # instead of raising IndexError)
        i = self._skip_blank(lines, 0)

        while i < size:
            # check if we're in new affiliation category
            if lines[i].lower().startswith(self.affiliation_categories):
                affiliation_cat, i = self._read_paragraph(lines, i)
                i = self._skip_blank(lines, i)
                if i >= size:
                    # category heading was the last content of the file
                    break

            # new affiliation (possibly spread over several lines)
            affiliation, i = self._read_paragraph(lines, i)
            i = self._skip_blank(lines, i)

            # extract all the names of this affiliation
            while i < size and lines[i].startswith(self.salutory_addresses):
                name = lines[i]
                i += 1

                # the rest, up to a blank line or the next name, is the
                # description; the bound is `size` (not `size - 1`) so a
                # description on the very last line is not lost
                description_parts = []
                while (i < size and lines[i] != ""
                       and not lines[i].startswith(self.salutory_addresses)):
                    description_parts.append(
                        lines[i] + self.description_splitter)
                    i += 1

                rows.append({"name": name,
                             "affiliation": affiliation,
                             "affiliation_category": affiliation_cat,
                             "description": "".join(description_parts)})
                i = self._skip_blank(lines, i)

            # we had a blank and no name is next -> new affiliation;
            # the blank lines have already been skipped above
        return rows

Event Timeline