Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F86502110
COP1to5_Analyzer.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Oct 6, 21:23
Size
4 KB
Mime Type
text/x-python
Expires
Tue, Oct 8, 21:23 (1 d, 22 h)
Engine
blob
Format
Raw Data
Handle
21432651
Attached To
R10013 cop-mining-participants
COP1to5_Analyzer.py
View Options
import
pandas
as
pd
import
re
import
partlistproc.COP_Analyzer
as
Analyzer
# TODO: rename to "Uppercase_affiliation_Analyzer" (describes the uppercase-line detection this class relies on)
class COP1to5_Analyzer():
    """Extracts participant data from the OCR'd participant lists of COP 1-5.

    Reads the intermediate .txt file (one layout line per text line), walks it
    line by line, and classifies each line as an affiliation category, an
    affiliation, a person (line starting with a salutory address), or part of
    the current person's description. The result is written to a CSV file.
    """

    def __init__(self, intermediate_name, encoding=None):
        """
        Args:
            intermediate_name (str): path of the intermediate .txt file to analyze.
            encoding (str, optional): how the intermediate file is encoded.
                Defaults to None (the platform default); "utf-8" can be used
                for results of tesseract.
        """
        self.intermediate_name = intermediate_name
        self.encoding = encoding

    def get_data(self, output_name):
        """Gets the data from the specified source file and writes it as CSV.

        Args:
            output_name (str): name of the output file to be generated.
        """
        print("Analyze the .txt file to generate the participant data")
        # 'with' guarantees the handle is closed even if reading raises.
        with open(self.intermediate_name, "r", encoding=self.encoding) as file:
            entire_text = file.read()
        # NOTE(review): the page-break character below is assumed to be a form
        # feed ('\x0c', emitted by tesseract between pages) -- confirm. The
        # original source showed an empty string here, which would insert a
        # newline between every single character and break the parser.
        entire_text = (entire_text.replace('\x0c', '\n')
                                  .replace(',', '.')
                                  .replace('Continued', ''))
        # split it to a list of lines
        content_list = entire_text.split('\n')
        # delete all the page numbers (lines such as "-12-")
        content_list = [el for el in content_list
                        if not (el.startswith('-') and el.endswith('-'))]

        # state of the participant currently being accumulated
        name = ""
        description = ""
        affiliation = ""
        affiliation_cat = ""

        # Collected records; the DataFrame is built once at the end with an
        # explicit column order (a set literal here would make the CSV column
        # order unspecified, and DataFrame.append was removed in pandas 2.0).
        columns = ["name", "affiliation", "affiliation_category", "description"]
        rows = []

        def store_person():
            # Append the participant currently being accumulated, if any,
            # and reset the per-person state.
            nonlocal name, description
            if name != "":
                rows.append({"name": name,
                             "affiliation": affiliation,
                             "affiliation_category": affiliation_cat,
                             "description": description})
                name = ""
                description = ""

        # fill in the data row by row
        i = 0
        list_size = len(content_list)
        while i < list_size:
            elem = content_list[i]
            # An all-uppercase line that is not a known abbreviation starts a
            # new affiliation or a new affiliation category.
            if (elem.isupper()
                    and elem not in Analyzer.COP_Analyzer.uppercase_abbrev
                    and elem[:3].isalpha()):
                # store the last person (if there is one)
                store_person()
                # check if it's a new affiliation category or a new affiliation
                if elem.lower().startswith(Analyzer.COP_Analyzer.affiliation_categories):
                    affiliation_cat = elem.lower()
                    # the category may continue over several uppercase lines
                    while i + 1 < list_size and content_list[i + 1].isupper():
                        if content_list[i + 1]:
                            affiliation_cat += " " + content_list[i + 1].lower()
                        i += 1
                else:
                    # set the new affiliation
                    affiliation = elem.lower()
                    # the affiliation may continue over several uppercase
                    # (possibly empty) lines
                    while i + 1 < list_size and (content_list[i + 1].isupper()
                                                 or not content_list[i + 1]):
                        if content_list[i + 1]:
                            affiliation += " " + content_list[i + 1].lower()
                        i += 1
            elif elem.startswith(Analyzer.COP_Analyzer.salutory_addresses):
                # a new person: store the last one, then start accumulating
                store_person()
                name = elem
            elif elem != "":
                # add it to the actual person's description
                description += elem + Analyzer.COP_Analyzer.description_splitter
            i += 1

        # BUG FIX: the last participant of the file was previously never
        # appended; flush the remaining accumulated person.
        store_person()

        data = pd.DataFrame(rows, columns=columns)
        # generate the output file (utf-8-sig so Excel detects the encoding)
        data.to_csv(output_name, encoding="utf-8-sig")

        print("do some analysis ---------------------------------------------")
        for cat, people in data.groupby('affiliation_category'):
            print(cat + " " + str(len(people)))
        print("The number of detected participants is " + str(len(data.index)))
Event Timeline
Log In to Comment