Page MenuHomec4science

UppercaseAffiliationMeetingAnalyzer.py
No OneTemporary

File Metadata

Created
Wed, Jul 17, 02:25

UppercaseAffiliationMeetingAnalyzer.py

import pandas as pd
import re
from partlistproc.MeetingAnalyzer import MeetingAnalyzer
class UppercaseAffiliationMeetingAnalyzer(MeetingAnalyzer):
    """Analyzer for participant lists that mark a new affiliation in capitals.

    The parsing relies on four attributes that are not defined in this class
    and are presumably provided by ``MeetingAnalyzer`` (TODO confirm):
    ``uppercase_abbrev``, ``affiliation_categories``, ``salutory_addresses``
    and ``description_splitter``.
    """

    # Fixed column order of the generated CSV file.
    _COLUMNS = ("name", "affiliation", "affiliation_category", "description")

    def __init__(self, intermediate_name, encoding=None):
        """
        Args:
            intermediate_name (str): name of the txt file that contains the
                text of the participant list
            encoding (str, optional): how the intermediate file is encoded.
                Defaults to None (which can be used for results of tesseract).
        """
        self.intermediate_name = intermediate_name
        self.encoding = encoding

    def get_data(self, output_name):
        """Parse the intermediate txt file and write the participants to CSV.

        Overrides the abstract method of ``MeetingAnalyzer``.

        Args:
            output_name (str): path of the CSV file to write. It gets the
                columns name, affiliation, affiliation_category and
                description, encoded as utf-8-sig.
        """
        print("Analyze the .txt file to generate the participant data with ")
        rows = self._parse_rows(self._read_lines())
        # Build the frame once from all rows: DataFrame.append was removed in
        # pandas 2.0 and copied the whole frame on every call.
        data = pd.DataFrame(rows, columns=list(self._COLUMNS))
        # generate the output file
        data.to_csv(output_name, encoding="utf-8-sig", index=False)
        MeetingAnalyzer.print_small_analysis(self, data)

    def _read_lines(self):
        """Return the normalized lines of the intermediate file."""
        with open(self.intermediate_name, "r", encoding=self.encoding) as file:
            entire_text = file.read()
        # NOTE(review): every single space becomes a line break here, which
        # also splits multi-word names. The literal may have lost characters
        # when the file was copied - confirm against the repository.
        entire_text = (entire_text.replace(' ', '\n')
                       .replace(',', '.')
                       .replace('Continued', ''))
        # split it to a list
        content_list = re.split('\n', entire_text)
        # delete all the page numbers (lines of the form "-12-")
        return [el for el in content_list
                if not (el.startswith('-') and el.endswith('-'))]

    def _parse_rows(self, content_list):
        """Turn the cleaned lines into one dict per participant."""
        rows = []
        name = ""
        description = ""
        affiliation = ""
        affiliation_cat = "parties"
        list_size = len(content_list)
        # An index loop is needed because multi-line headers consume
        # additional lines inside a single iteration.
        i = 0
        while i < list_size:
            elem = content_list[i]
            # An upper-case line that is not a known abbreviation and starts
            # with three letters opens a new affiliation or category.
            if (elem.isupper()
                    and elem not in self.uppercase_abbrev
                    and elem[:3].isalpha()):
                # store the last person (if there is one)
                # NOTE(review): text collected while no person is open is
                # kept and ends up in the next person's description -
                # confirm this is intended.
                if name:
                    rows.append(self._make_row(
                        name, affiliation, affiliation_cat, description))
                    name = ""
                    description = ""
                if elem.lower().startswith(self.affiliation_categories):
                    affiliation_cat = elem.lower()
                    # a category may run over several upper-case lines
                    while (i + 1 < list_size
                           and content_list[i + 1].isupper()):
                        affiliation_cat += " " + content_list[i + 1].lower()
                        i += 1
                else:
                    affiliation = elem.lower()
                    # an affiliation may run over several upper-case lines;
                    # empty lines in between are skipped
                    while i + 1 < list_size and (
                            content_list[i + 1].isupper()
                            or not content_list[i + 1]):
                        if content_list[i + 1]:
                            affiliation += " " + content_list[i + 1].lower()
                        i += 1
            elif elem.startswith(self.salutory_addresses):
                # a new person starts: store the previous one first
                if name:
                    rows.append(self._make_row(
                        name, affiliation, affiliation_cat, description))
                    description = ""
                name = elem
            elif elem != "":
                # add it to the current person's description
                description += elem + self.description_splitter
            i += 1
        # The last person has no following header or salutation line that
        # would trigger storing it, so store it explicitly.
        if name:
            rows.append(self._make_row(
                name, affiliation, affiliation_cat, description))
        return rows

    @staticmethod
    def _make_row(name, affiliation, affiliation_cat, description):
        """Build the record of one participant."""
        return {"name": name,
                "affiliation": affiliation,
                "affiliation_category": affiliation_cat,
                "description": description}

Event Timeline