diff --git a/code/COP_Analyzer.py b/code/COP_Analyzer.py
new file mode 100644
index 0000000..3d0ca7e
--- /dev/null
+++ b/code/COP_Analyzer.py
@@ -0,0 +1,17 @@
+class COP_Analyzer:
+    """Placeholder analyzer; for now it only stores its arguments."""
+
+    def __init__(self, copN, intermediate_name, output_name):
+        """Store the constructor arguments on the instance.
+
+        Args:
+            copN: number of the cop edition (presumably an int, as in
+                COP_TextExtractor -- confirm)
+            intermediate_name: name of the intermediate file
+            output_name: name of the output file
+        """
+        self.copN = copN
+        self.intermediate_name = intermediate_name
+        self.output_name = output_name
+
+    # TODO: maybe make this an abstract class
diff --git a/code/COP_Extractor.py b/code/COP_Extractor.py
new file mode 100644
index 0000000..0a4fb28
--- /dev/null
+++ b/code/COP_Extractor.py
@@ -0,0 +1,123 @@
+"""Text extraction from the PDF participant lists of the cop editions."""
+
+import os
+
+import PyPDF2
+import pytesseract
+import textract
+from pdf2image import convert_from_path
+
+# NOTE(review): default_startpage and box_pages are used by doOCR but are
+# only defined in extract_participants_copX.py, and __getPDFpath is not
+# defined in this class. They have to be brought into scope before this
+# module can run.
+
+
+class COP_TextExtractor:
+    """Extracts the text from a PDF participant list"""
+
+    def __init__(self, copN, output_file):
+        """ Constructor of this class
+
+        Args:
+            copN (int): number of the edition of cop to process
+            output_file (string): name of the file to put the text in
+        """
+        self.copN = copN
+        # doOCR writes to self.intermediate_name, so store the file there
+        self.intermediate_name = output_file
+
+    def doOCR(self, startpage, endpage):
+        """Performs OCR with tesseract on a PDF file.
+
+        The recognized text is written to self.intermediate_name, replacing
+        its previous content.
+
+        Args:
+            startpage (int): first page to process, counted from 1
+            endpage (int): last page to process, counted from 1
+                (inclusive). If startpage and endpage are both 0, the
+                pages from the default start page of this cop to the end
+                of the PDF are processed.
+
+        Returns:
+            boolean: True if the extraction was successful, False if the
+                given page range is invalid
+        """
+        temporary_filename = "temp.pdf"
+
+        # insert boxes that help for the OCR on the critical pages
+        print("Insert boxes where necessary")
+        box = "files\\column_block.pdf"
+        PDF_file = self.__getPDFpath()
+        # Keep both inputs open until the boxed copy has been written; the
+        # writer may still read page data from them at that point.
+        with open(box, "rb") as f_box, open(PDF_file, "rb") as file:
+            pdf_box = PyPDF2.PdfFileReader(f_box)
+            file_read = PyPDF2.PdfFileReader(file)
+            output = PyPDF2.PdfFileWriter()
+
+            # NOTE(review): box_pages is indexed with copN - 1, while
+            # default_startpage below is indexed with copN -- confirm
+            # which convention is intended.
+            for i in range(file_read.getNumPages()):
+                page = file_read.getPage(i)
+                # insert the boxes where necessary
+                if i in box_pages[self.copN - 1]:
+                    page.mergePage(pdf_box.getPage(0))
+                output.addPage(page)
+
+            with open(temporary_filename, "wb") as boxed_file:
+                output.write(boxed_file)
+
+        try:
+            # start of the OCR procedure
+            print("Create images from the pdf file")
+            pages = convert_from_path(temporary_filename, dpi=200)
+
+            # check indexes
+            if startpage == 0 and endpage == 0:
+                # set to default page space
+                startpage = default_startpage[self.copN]
+                endpage = len(pages)
+            elif len(pages) < endpage or endpage < startpage or startpage <= 0:
+                return False
+            # Correct the page numbers s.t. they begin counting from 0 and
+            # end is exclusive
+            startpage -= 1
+
+            # Open the file in write mode so that
+            # all contents of all pages are added to the same file
+            with open(self.intermediate_name, "w") as f:
+                # Iterate through all the pages stored above
+                for i in range(startpage, endpage):
+                    print("Reading page " + str(i + 1))
+
+                    # Recognize the text as string in image using pytesseract
+                    text = str(pytesseract.image_to_string(
+                        pages[i], config="--psm 3 --oem 1"))
+
+                    # Write the processed text to the file.
+                    f.write(text)
+        finally:
+            # The boxed copy is only needed for the OCR, so remove it on
+            # every exit path (success, invalid page range, or error).
+            os.remove(temporary_filename)
+
+        return True
+
+    def __doPDFtoTxt(self):
+        """Appends the textract output for the PDF to the intermediate file.
+
+        Returns:
+            boolean: always True
+        """
+        PDF_file = self.__getPDFpath()
+        text = textract.process(PDF_file, method='tesseract')
+        print(text)
+        # NOTE(review): textract.process presumably returns bytes, in which
+        # case str(text) writes the "b'...'" representation -- confirm and
+        # decode instead if that is not intended.
+        with open(self.intermediate_name, "a") as f:
+            f.write(str(text))
+        return True
diff --git a/code/extract_participants_copX.py b/code/extract_participants_copX.py
new file mode 100644
index 0000000..a43e0f2
--- /dev/null
+++ b/code/extract_participants_copX.py
@@ -0,0 +1,53 @@
+""" The main script of the cop participants extraction.
+Takes as an argument the number of the cop to process.
+"""
+
+import sys
+
+# Constants
+# This is used to distinguish names from abbreviations
+uppercase_abbrev = ["US", "USA", "AO", "UK", "WWF-US", "WWF-UK", "EPFL"]
+# Must be a tuple for the function "startswith" of str
+salutory_addresses = ("Mr", "Ms", "Sr", "Sra", "H.E.", "S.E.", "M.",
+                      "Mme", "Dr.", "Drs.")
+default_startpage = [126, 2, 3, 3]  # TODO add for copN > 4
+# Where to add boxes for ocr, index == copX
+box_pages = [[], range(47, 60), [], []]
+
+seperator = "#"
+
+
+# format:
+# extract_participants_copX.py COP_NUMBER INTERMEDIATE_FILE OUTPUT_FILE
+#     (STARTPAGE ENDPAGE)
+# the last option is given if the OCR has already been done (for cop 1 - 4)
+
+# parse arguments
+arguments = sys.argv
+if len(arguments) < 4:
+    # The three leading arguments are mandatory; stop with a readable
+    # message instead of an IndexError.
+    sys.exit("usage: extract_participants_copX.py COP_NUMBER "
+             "INTERMEDIATE_FILE OUTPUT_FILE "
+             "(STARTPAGE ENDPAGE)")
+copNumber = int(arguments[1])
+intermediateFilename = arguments[2]
+outputFilename = arguments[3]
+# 0/0 is passed on when no page range is given
+startpage = 0
+endpage = 0
+if len(arguments) == 6:
+    startpage = int(arguments[4])
+    endpage = int(arguments[5])
+
+# TODO replace
+# NOTE(review): COP_Processor is neither defined nor imported in this
+# script -- it has to be imported before this can run.
+proc = COP_Processor(copNumber, intermediateFilename, outputFilename)
+success = proc.pdfToData(startpage, endpage)
+
+if success:
+    print("The data has successfully been extracted")
+else:
+    print("The data couldn't be extracted correctly. "
+          "Maybe this cop is not implemented yet.")