extract_participants.py
No OneTemporary
Actions

Subscribers

None

File Metadata

	""" The main script of the cop participants extraction.
	Takes as an argument the number of the cop to process.
	"""

	import os
	import sys

	import partlistproc
	from partlistproc.COP_TextExtractor import COP_TextExtractor
	from partlistproc.COP_Analyzer import COP_Analyzer

	# Result directories (relative paths) and the default file names inside
	# them. The "X" in each default name is a placeholder that the script
	# substitutes with the cop label.
	txt_prefix = "../results/participants-txt/"
	csv_prefix = "../results/participants-csv/"
	default_intermediate_name = "{}raw_X.txt".format(txt_prefix)
	default_output_name = "{}participants_X.csv".format(csv_prefix)


	# usage:
	# extract_participants.py <numberOfCop> <intermediateFilename>
	# <outputFilename>
	# The two file names are optional and are given together when the OCR has
	# already been done (for cop 1 - 4).

	# Parse the command line: the cop number is mandatory, the intermediate
	# (text) and output (csv) file names are optional overrides.
	arguments = sys.argv
	if len(arguments) < 2:
		# Without a cop number there is nothing to process; exit with a usage
		# message instead of an unexplained IndexError.
		sys.exit(
			"usage: extract_participants.py <numberOfCop> "
			"[<intermediateFilename> [<outputFilename>]]"
		)
	label = arguments[1]
	intermediate_name = default_intermediate_name.replace("X", label)
	output_name = default_output_name.replace("X", label)
	# Each override is checked on its own so that supplying only the
	# intermediate file name no longer indexes past the end of the list.
	if len(arguments) > 2:
		intermediate_name = txt_prefix + arguments[2]
	if len(arguments) > 3:
		output_name = csv_prefix + arguments[3]

	# First, extract the text from the pdf if not already done; an existing
	# intermediate file is reused as-is.
	if not os.path.isfile(intermediate_name):
		extr = COP_TextExtractor(label, intermediate_name)
		extr.extract_text()

	# Second, extract the data from the text
	ana = COP_Analyzer(label, intermediate_name, output_name)
	ana.get_data()