Page MenuHomec4science

extract_participants.py
No OneTemporary

File Metadata

Created
Sun, Aug 4, 11:30

extract_participants.py

""" The main script of the cop participants extraction.
Takes the number of the cop to process as its first argument, optionally
followed by the intermediate text filename and the output csv filename.
"""
import os
import sys
import partlistproc
from partlistproc.MeetingAnalyzerFactory import MeetingAnalyzerFactory
from partlistproc.PdfExtractorFactory import PdfExtractorFactory
# Directories (relative to the working directory) for the intermediate text
# files and for the resulting csv files.
txt_prefix = "../results/participants-txt/"
csv_prefix = "../results/participants-csv/"

# Default file names; the "X" placeholder is replaced by the cop label.
default_intermediate_name = txt_prefix + "raw_X.txt"
default_output_name = csv_prefix + "participants_X.csv"

# Command line format:
#   extract_participants.py <numberOfCop> [<intermediateFilename>
#                                          [<outputFilename>]]
# The optional file names are given if the OCR has already been done
# (for cop 1 - 4).
USAGE = (
    "usage: extract_participants.py <numberOfCop> "
    "[<intermediateFilename> [<outputFilename>]]"
)


def resolve_paths(arguments):
    """Derive the cop label and the file paths from the command line.

    Args:
        arguments: the argument list in ``sys.argv`` layout, i.e. the
            script name followed by the cop number and, optionally, the
            intermediate file name and the output file name.

    Returns:
        A tuple ``(label, intermediate_name, output_name)``. File names that
        are not given fall back to the defaults built from the label; given
        names are placed below ``txt_prefix`` / ``csv_prefix``.

    Raises:
        ValueError: if the cop number is missing.
    """
    if len(arguments) < 2:
        raise ValueError(USAGE)

    label = arguments[1]
    intermediate_name = default_intermediate_name.replace("X", label)
    output_name = default_output_name.replace("X", label)

    # Each optional argument is checked on its own, so passing only the
    # intermediate file name no longer fails on the missing output name.
    if len(arguments) > 2:
        intermediate_name = txt_prefix + arguments[2]
    if len(arguments) > 3:
        output_name = csv_prefix + arguments[3]

    return label, intermediate_name, output_name


def main(arguments=None):
    """Run the extraction for the cop named on the command line.

    Args:
        arguments: argument list in ``sys.argv`` layout; defaults to
            ``sys.argv`` itself.
    """
    if arguments is None:
        arguments = sys.argv

    try:
        label, intermediate_name, output_name = resolve_paths(arguments)
    except ValueError as err:
        # Print the usage message to stderr and exit with a non-zero status.
        sys.exit(str(err))

    # First, extract the text from the pdf if not already done
    if not os.path.isfile(intermediate_name):
        extr_factory = PdfExtractorFactory(label, intermediate_name)
        extr = extr_factory.createPdfExtractor()
        extr.extract_text()

    # Second, extract the data from the text
    ana_factory = MeetingAnalyzerFactory(label, intermediate_name)
    ana = ana_factory.get_analyzer()
    ana.get_data(output_name)


if __name__ == "__main__":
    main()

Event Timeline