# file: code/index_analysis_cop1.py
## Jan Linder, 21.09.20
## Analyses the index part of the COP 1 participant list: extracts
## (family name, first name, party) triples from the OCR'd index text
## and prints a per-party summary.

import re

import pandas as pd

# constants
# Fully-uppercase tokens that are abbreviations, not family names.
# BUGFIX: the parsing loop referred to this as `uppercaseAbbrev`
# (undefined) and raised a NameError.
uppercase_abbrev = ["US", "USA", "AO", "UK"]

filename = "test_indexCOP1.txt"

# OCR noise removed from the text before parsing
_NOISE_TOKENS = ('Affilliation/Country', 'Name', '—', '‘', '|', '(', '{', '[')


def read_and_clean(path):
    """Return the text of *path* with known OCR noise removed."""
    with open(path, "r") as fh:
        text = fh.read()
    for token in _NOISE_TOKENS:
        text = text.replace(token, '')
    return text


def extract_participants(text):
    """Parse the cleaned index text into a DataFrame with the columns
    'family name', 'first name' and 'party'.

    Heuristics (as in the original script):
      * a run of fully-uppercase tokens is a family name,
      * the single token after it is the first name,
      * everything up to the next uppercase token that is not a known
        abbreviation belongs to the party/organization.
    """
    # split into tokens and remove the empty slots
    tokens = [el for el in re.split(r', | |\n', text) if el]
    size = len(tokens)
    rows = []
    i = 0
    while i < size:
        # go to the next word that is a family name (uppercase);
        # BUGFIX: bounds checks prevent the IndexError the original
        # raised when the token list ended inside one of these scans
        while i < size and not tokens[i].isupper():
            i += 1
        if i >= size:
            break
        family_name = tokens[i]
        i += 1
        # the family name may consist of several uppercase words
        while i < size and tokens[i].isupper():
            family_name += " " + tokens[i]
            i += 1
        if i >= size:
            break
        # assume that the first name is only one word
        first_name = tokens[i]
        i += 1
        # assume that the rest is the organization
        party = ""
        while i < size and not (tokens[i].isupper() and tokens[i] not in uppercase_abbrev):
            party += " " + tokens[i]
            i += 1
        rows.append({'family name': family_name,
                     'first name': first_name,
                     'party': party})
        # print the row
        print(family_name + ", " + first_name + ": " + party)
    # DataFrame.append was removed in pandas 2.0 — build from the row list;
    # an explicit column list also fixes the nondeterministic order the
    # original set literal `columns = {...}` produced.
    return pd.DataFrame(rows, columns=['family name', 'first name', 'party'])


def main():
    print("Read file and replace some words")
    text = read_and_clean(filename)
    print("split the text up to a list")
    data = extract_participants(text)

    print("do some analysis -------------------------------------------------------------")
    # sort per party and count
    for organization, people in data.groupby('party'):
        print(organization)
        print(people)
    print("The number of detected participants is " + str(len(data.index)))


if __name__ == "__main__":
    main()
# file: code/ocr-list.py
# remark by Jan : This script was mainly taken from the web as a template to use pytesseract.
#
# Converts a participants PDF into one JPEG per page, then OCRs every
# page with pytesseract and collects the recognized text in out_text.txt.

import os
import sys

import PIL as pillow
from PIL import Image
import pytesseract
from pdf2image import convert_from_path

# Path of the pdf
# NOTE(review): the leading backslash makes this relative to the drive
# root on Windows — confirm the intended location.
PDF_file = "\\files\\participants-cop25.pdf"


def pdf_to_images(pdf_path):
    """Part #1: save each PDF page as page_<n>.jpg; return the page count.

    PDF page 1 -> page_1.jpg, PDF page 2 -> page_2.jpg, ..., page_n.jpg.
    """
    pages = convert_from_path(pdf_path, dpi=200)
    print("Creating images out of the pdf")
    for number, page in enumerate(pages, start=1):
        # Save the image of the page in the working directory
        page.save("page_" + str(number) + ".jpg", 'JPEG')
    return len(pages)


def images_to_text(page_count, out_path):
    """Part #2: OCR page_1.jpg .. page_<page_count>.jpg into *out_path*."""
    # Open once in append mode so all pages end up in the same file.
    # NOTE(review): "a" also keeps the text of previous runs — confirm
    # that re-runs are meant to accumulate output.
    with open(out_path, "a") as out:
        for i in range(1, page_count + 1):
            print("Working on page " + str(i) + " out of " + str(page_count))
            # image_to_string already returns str — the original's
            # str(((...))) wrapper was redundant; the context manager
            # closes the image handle the original leaked.
            with Image.open("page_" + str(i) + ".jpg") as img:
                text = pytesseract.image_to_string(img)
            out.write(text)


def main():
    page_count = pdf_to_images(PDF_file)
    images_to_text(page_count, "out_text.txt")


if __name__ == "__main__":
    main()
# file: code/process_copX.py
### Jan Linder
###
### Extracts participant lists from UNFCCC COP PDFs.  For COP 1-4 the PDF
### is OCR'd page by page (pytesseract via pdf2image); newer COPs are
### converted with textract.  The intermediate text file is then parsed
### into a pandas DataFrame, one row per participant.

import os
import re
import sys
from pathlib import Path

import pandas as pd
import pytesseract
import PyPDF2
import textract
from pdf2image import convert_from_path


# Constants
# Fully-uppercase tokens that are abbreviations, not family names.
uppercase_abbrev = ["US", "USA", "AO", "UK", "WWF-US", "WWF-UK", "EPFL"]
# Must be a tuple for the str.startswith calls below.
salutory_addresses = ("Mr", "Ms", "Sr", "Sra", "H.E.", "S.E.", "M.", "Mme", "Dr.", "Drs.")
# Default first page of the participant index, indexed by copN - 1.
default_startpage = [126, 2, 3, 3]  # TODO add for copN > 4
# Pages that need the masking box merged in before OCR, indexed by copN - 1.
box_pages = [[], range(47, 60), [], []]

SEPERATOR = "#"


# ----------------- Classes -----------------

class COP_Processor():
    """Drives the PDF -> intermediate text -> DataFrame pipeline for one COP."""

    def __init__(self, copN, intermediate_name, output_name):
        self.copN = copN                            # COP number (1-based)
        self.intermediate_name = intermediate_name  # txt file holding the raw text
        self.output_name = output_name              # intended csv output (TODO, unused)

    def __doOCR(self, startpage, endpage):
        """OCR pages [startpage, endpage) of the COP PDF into the
        intermediate text file.  Page numbers are 1-based; (0, 0) selects
        the per-COP default range.  Returns True if successful."""
        # insert boxes that help the OCR on the critical pages
        print("Insert boxes where necessary")
        with open(Path("files") / "column_block.pdf", "rb") as f_box, \
             open(self.__getPDFpath(), "rb") as f_pdf:
            pdf_box = PyPDF2.PdfFileReader(f_box)
            file_read = PyPDF2.PdfFileReader(f_pdf)
            output = PyPDF2.PdfFileWriter()
            for i in range(file_read.getNumPages()):
                page = file_read.getPage(i)
                # merge the masking box where necessary
                if i in box_pages[self.copN - 1]:
                    page.mergePage(pdf_box.getPage(0))
                output.addPage(page)
            with open("boxed.pdf", "wb") as boxed_file:
                output.write(boxed_file)

        # start of the OCR procedure
        print("Create images from the pdf file")
        pages = convert_from_path("boxed.pdf", dpi=200)

        # check / default the page range
        if startpage == 0 and endpage == 0:
            # BUGFIX: index with copN - 1 (was copN — off by one relative
            # to box_pages and an IndexError for copN == 4)
            startpage = default_startpage[self.copN - 1]
            endpage = len(pages)
        elif len(pages) < endpage or endpage < startpage or startpage <= 0:
            return False
        # pages count from 0 internally; endpage stays exclusive
        startpage -= 1

        # all pages are written into the same intermediate file
        with open(self.intermediate_name, "w") as f:
            for i in range(startpage, endpage):
                print("Reading page " + str(i + 1))
                # recognize the text in the page image using pytesseract
                text = pytesseract.image_to_string(pages[i], config="--psm 3 --oem 1")
                f.write(text)

        os.remove("boxed.pdf")
        return True  # BUGFIX: the documented success value was never returned

    def __doPDFtoTxt(self):
        """Convert the whole PDF to text via textract (tesseract backend)
        and append it to the intermediate file.  Returns True."""
        PDF_file = self.__getPDFpath()
        text = textract.process(PDF_file, method='tesseract')
        print(text)
        with open(self.intermediate_name, "a") as f:
            f.write(str(text))
        return True

    def __processCOP1(self):
        """Parse the COP 1 index text: FAMILY NAME / first name / party."""
        print("Read file and replace some words")
        with open(self.intermediate_name, "r") as file:
            entire_text = file.read()
        for noise in ('Affilliation/Country', 'Name', '—', '‘', '|', '(', '{', '['):
            entire_text = entire_text.replace(noise, '')

        # split it into tokens and remove the empty slots
        print("split the text up to a list")
        content_list = [el for el in re.split(r', | |\n', entire_text) if el]

        rows = []
        i = 0
        list_size = len(content_list)
        print("Extract the data")
        while i < list_size:
            # go to the next word that is a family name (uppercase);
            # BUGFIX: bounds checks prevent IndexError at end of list
            while i < list_size and not content_list[i].isupper():
                i += 1
            if i >= list_size:
                break
            familyName = content_list[i]
            i += 1
            # the family name may be more than one word
            while i < list_size and content_list[i].isupper():
                familyName += " " + content_list[i]
                i += 1
            if i >= list_size:
                break
            # assume that the first name is only one word
            firstName = content_list[i]
            i += 1
            # assume that the rest is the organization
            party = ""
            while i < list_size and not (content_list[i].isupper()
                                         and content_list[i] not in uppercase_abbrev):
                party += " " + content_list[i]
                i += 1
            rows.append({'family name': familyName, 'first name': firstName, 'party': party})
            print(familyName + ", " + firstName + ": " + party)

        # DataFrame.append was removed in pandas 2.0 — build from the rows
        data = pd.DataFrame(rows, columns=['family name', 'first name', 'party'])
        ## TODO output a csv file or similar (data is currently discarded)
        return True

    def __processCOP2to4(self):
        """Parse the COP 2-4 text: UPPERCASE party headers, one person per
        salutation-prefixed line, following lines form the description."""
        print("Read file and replace some words")
        with open(self.intermediate_name, "r") as file:
            entire_text = file.read()
        # NOTE(review): this strips EVERY space from the text; possibly a
        # non-breaking space was intended — confirm against the source PDF.
        entire_text = entire_text.replace(' ', '')

        print("split the text up to a list")
        content_list = entire_text.split('\n')

        name = ""
        description = ""
        party = ""
        rows = []

        print("Extract the data")
        list_size = len(content_list)
        i = 0
        while i < list_size:
            elem = content_list[i]
            if elem.isupper() and elem not in uppercase_abbrev:
                # a new party starts: store the last person (if any) first
                if name != "":
                    rows.append({'name': name, 'party': party, 'description': description})
                    name = ""
                    description = ""
                party = elem.lower()
                # the party name may continue over several lines.
                # BUGFIX: the original read content_list[i+1] on every
                # iteration (appending the same line repeatedly) and never
                # consumed the continuation lines — advance with j.
                j = 0
                while i + j + 1 < list_size and content_list[i + j + 1].isupper():
                    party += content_list[i + j + 1].lower()
                    j += 1
                i += j
            elif elem.startswith(salutory_addresses):
                # a new person: store the last one first
                if name != "":
                    rows.append({'name': name, 'party': party, 'description': description})
                    description = ""
                name = elem
            elif elem != "":
                # add it to the current person's description
                description += elem + "; "
            i += 1

        # BUGFIX: flush the final person (the original dropped them)
        if name != "":
            rows.append({'name': name, 'party': party, 'description': description})

        data = pd.DataFrame(rows, columns=['name', 'party', 'description'])

        print("do some analysis -------------------------------------------------------------")
        # sort per party and count
        for organization, people in data.groupby('party'):
            print(organization)
            print(people)
        print("The number of detected participants is " + str(len(data.index)))
        return True

    def __processCOPnewer(self):
        """Parse textract output of newer COPs: after a 'parties' marker,
        each party block lists persons (salutation-prefixed entries)
        followed by their description lines."""
        print("Read file and replace some words")
        with open(self.intermediate_name, "r") as file:
            entire_text = file.read()
        # textract output was saved via str(bytes), hence the literal
        # "\r\n" / "\x0c" escape sequences in the text
        entire_text = (entire_text.replace('\\r\\n', SEPERATOR)
                                  .replace('\\x0c', '')
                                  .replace(' (continued)', '')
                                  .replace('(continued)', ''))

        print("Split the text up to a list")
        # ADDED FOR COP12: strip spaces at both ends of every element
        entire_list = [el.strip(' ') for el in entire_text.split(SEPERATOR)]
        size = len(entire_list)
        print(entire_list)

        rows = []
        print("Extract the data")
        i = 0
        # skip everything until the real participant list begins;
        # BUGFIX: bounds check prevents IndexError when the marker is absent
        while i < size and entire_list[i].lower() != "parties":
            i += 1
        i += 1
        while i < size:
            party = entire_list[i]
            i += 1
            # extract all the persons of this party
            while i < size and entire_list[i].startswith(salutory_addresses):
                name = entire_list[i]
                i += 1
                # the rest, up to a blank line or the next salutation,
                # is the description
                description = ""
                while (i < size - 1 and entire_list[i] != ""
                       and not entire_list[i].startswith(salutory_addresses)):
                    description += entire_list[i] + " "
                    i += 1
                rows.append({'name': name, 'party': party, 'description': description})
                # skip blank separators between persons
                while i < size and entire_list[i] == "":
                    i += 1
            # skip blank separators between parties
            while i < size and entire_list[i] == "":
                i += 1

        # DataFrame.append was removed in pandas 2.0 — build from the rows
        data = pd.DataFrame(rows, columns=['name', 'party', 'description'])

        print("do some analysis -------------------------------------------------------------")
        # sort per party and count
        for organization, people in data.groupby('party'):
            print(organization)
            print(people)
        print("The number of detected participants is " + str(len(data.index)))
        return True

    def __getPDFpath(self):
        """Path of the source PDF for this COP."""
        return Path("files/participants-cop" + str(self.copN) + ".pdf")

    # The method that will be called from outside to convert the page
    def pdfToData(self, startpage=0, endpage=0):
        """Entry point: produce the intermediate text file (unless it
        already exists) and parse it with the COP-appropriate parser.
        Returns True on success, False otherwise."""
        if 0 < self.copN <= 4:
            # use OCR if the intermediate file does not exist yet
            if not os.path.isfile(self.intermediate_name):
                self.__doOCR(startpage, endpage)
            # we now have the text file
            if self.copN == 1:
                return self.__processCOP1()
            return self.__processCOP2to4()
        # newer COPs: textract conversion
        if not os.path.isfile(self.intermediate_name):
            self.__doPDFtoTxt()
        return self.__processCOPnewer()


# ----------------------------------------------------------
# format: process_copX <copN> <intermediate txt> <output> [<startpage> <endpage>]
# the page arguments are given if the OCR has already been done (cop 1-4)
def main(argv):
    """Parse the CLI arguments and run the processor."""
    copNumber = int(argv[1])
    intermediateFilename = argv[2]
    outputFilename = argv[3]
    startpage = 0
    endpage = 0
    if len(argv) == 6:
        startpage = int(argv[4])
        endpage = int(argv[5])

    proc = COP_Processor(copNumber, intermediateFilename, outputFilename)
    success = proc.pdfToData(startpage, endpage)

    if success:
        print("The data has successfully been extracted")
    else:
        print("The data couldn't be extracted correctly. Maybe this cop is not implemented yet.")


if __name__ == "__main__":
    main(sys.argv)