Page MenuHomec4science

extract_participants.py
No OneTemporary

File Metadata

Created
Sun, Oct 6, 21:26

extract_participants.py

""" The main script of the cop participants extraction.
Takes the number of the COP to process as its first command-line argument.
"""
import os
import sys
import partlistproc
from partlistproc.COP_TextExtractor import COP_TextExtractor
from partlistproc.COP_Analyzer import COP_Analyzer
txt_prefix = "../results/participants-txt/"
csv_prefix = "../results/participants-csv/"
default_intermediate_name = txt_prefix + "raw_X.txt"
default_output_name = csv_prefix + "participants_X.csv"

# Command-line format:
#   extract_participants.py <numberOfCop> [<intermediateFilename>
#                                          [<outputFilename>]]
# The optional file names are given if the OCR has already been done
# (for cop 1 - 4).
USAGE = (
    "usage: extract_participants.py <numberOfCop> "
    "[<intermediateFilename> [<outputFilename>]]"
)


def parse_arguments(arguments):
    """Derive the cop label and the two file paths from the argument list.

    Args:
        arguments: The argument vector in ``sys.argv`` layout, i.e. the
            script name followed by the cop number and, optionally, the
            intermediate (text) file name and the output (csv) file name.

    Returns:
        A ``(label, intermediate_name, output_name)`` tuple. File names
        given on the command line are prefixed with ``txt_prefix`` /
        ``csv_prefix``; missing ones fall back to the defaults with ``X``
        replaced by the label.

    Raises:
        SystemExit: If the cop number is missing.
    """
    if len(arguments) < 2:
        # Exit with a readable usage line instead of a bare IndexError.
        sys.exit(USAGE)

    label = arguments[1]
    intermediate_name = default_intermediate_name.replace("X", label)
    output_name = default_output_name.replace("X", label)

    # Each optional argument is checked on its own so that passing only the
    # intermediate file name no longer fails on the missing output name.
    if len(arguments) > 2:
        intermediate_name = txt_prefix + arguments[2]
    if len(arguments) > 3:
        output_name = csv_prefix + arguments[3]

    return label, intermediate_name, output_name


def main(arguments=None):
    """Run the text extraction (if needed) and then the data extraction.

    Args:
        arguments: Argument vector in ``sys.argv`` layout; defaults to
            ``sys.argv`` when omitted.
    """
    if arguments is None:
        arguments = sys.argv
    label, intermediate_name, output_name = parse_arguments(arguments)

    # First, extract the text from the pdf if not already done
    if not os.path.isfile(intermediate_name):
        extr = COP_TextExtractor(label, intermediate_name)
        extr.extract_text()

    # Second, extract the data from the text
    ana = COP_Analyzer(label, intermediate_name, output_name)
    ana.get_data()


if __name__ == "__main__":
    main()

Event Timeline