diff --git a/code/lib/partlistproc/DigitalPdfExtractor.py b/code/lib/partlistproc/DigitalPdfExtractor.py
index b422025..050e2e1 100644
--- a/code/lib/partlistproc/DigitalPdfExtractor.py
+++ b/code/lib/partlistproc/DigitalPdfExtractor.py
@@ -1,190 +1,210 @@
 import functools
+import pandas as pd
 from pdfminer.layout import LAParams
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfparser import PDFParser
 import partlistproc.MeetingAnalyzer as Ana
 from partlistproc import PDFPageDetailedAggregator
 from partlistproc.PdfExtractor import PdfExtractor
 
 # what is inserted in the txt to mark a new page
 new_page_marker = ''
 
 IDX_PAGENR = 0
 IDX_X0 = 1
 IDX_Y0 = 2
 IDX_X1 = 3
 IDX_Y1 = 4
 IDX_ELEM = 5
 
 
 class DigitalPdfExtractor(PdfExtractor):
     """
     Class to represent an extractor that converts a normal pdf
     participant list into a .txt file
     """
 
     def __init__(self, data_file, output_file, start_page, column_tolerance=5,
-                 linebreak_tolerance=5, sameline_tolerance=0.5, list_parts=1):
+                 linebreak_tolerance=5, sameline_tolerance=0.5, list_parts=1,
+                 valid_affiliation_names_path=None):
         """
         Constructor of the class
 
         Args:
             data_file (string): the PDF file to extract the text from
             output_file (string): the file to put the output text in, usually .txt
             start_page (int): the first page of the pdf to extract
             column_tolerance (int, optional): if two elements are more than this
                 x0 distance apart, they're not in the same column. Defaults to 5.
             linebreak_tolerance (int, optional): if 2 lines are further apart than
                 this, insert a double line break between them. Defaults to 5.
             sameline_tolerance (float, optional): the y0 difference that still
                 accepts two elements as being on the same line. Defaults to 0.5.
             list_parts (int, optional): how many pdfs the list is split into.
                 Defaults to 1.
+            valid_affiliation_names_path (str, optional): path to a csv file that
+                contains a dataframe with valid country names. Defaults to None.
""" self.data_file = data_file self.output_file = output_file self.start_page = start_page self.column_tolerance = column_tolerance self.linebreak_tolerance = linebreak_tolerance self.sameline_tolerance = sameline_tolerance self.list_parts = list_parts + self.valid_affiliation_names_path = valid_affiliation_names_path # this dictionnary contains (page -> y0) for every affiliation category title self.category_dict = {} def extract_text(self): if self.list_parts == 1: # everything normal self.extract_text_of_pdf() else: base_name = self.data_file[:self.data_file.index(".pdf")] for i in range(1, self.list_parts + 1): print("Part " + str(i)) self.data_file = base_name + "-" + str(i) + ".pdf" self.extract_text_of_pdf() def extract_text_of_pdf(self): """ Overriding abstract method """ print("Extract the text from the pdf list using pdfminer.six") fp = open(self.data_file, 'rb') parser = PDFParser(fp) rsrcmgr = PDFResourceManager() laparams = LAParams() device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.get_pages(fp): interpreter.process_page(page) # receive the LTPage object for this page device.get_result() fp.close() # find the pages of the affiliation category titles with y-position rows = device.rows for row in rows: if (row[IDX_ELEM].lower()).startswith(Ana.MeetingAnalyzer.affiliation_categories): # this is a new title -> check if the first one on its page page = row[IDX_PAGENR] if page in self.category_dict: (self.category_dict[page]).append(row[IDX_Y0]) else: self.category_dict[page] = [row[IDX_Y0]] # device.rows now contain the text containers with location sorted_rows = sorted(rows, key=functools.cmp_to_key(self.make_comparator(self.lineComesFirst)), reverse=False) - # print(sorted_rows[:1000]) + + # To prevent bug that doesn't recognize affiliations at some points, + # import a list of valid country names + valid_affiliation_names = set() + if self.valid_affiliation_names_path is not None: + valid_affiliation_names = set((pd.read_csv(self.valid_affiliation_names_path, encoding="utf-8-sig"))["valid_affiliations"].values.tolist()) # insert line breaks as empty elements sorted_rows_with_linebreaks = [] prev = '' size = len(sorted_rows) for i, row in enumerate(sorted_rows): # first of all, ignore all the pages before the start page and the page numbers if not PdfExtractor.is_pagenumber(row[IDX_ELEM]) and row[IDX_PAGENR] >= self.start_page - 1: # normal case: in the same column - if (prev != '' and prev[IDX_PAGENR] == row[IDX_PAGENR] and abs(prev[IDX_X0] - row[IDX_X0]) < self.column_tolerance): + if (prev != '' and prev[IDX_PAGENR] == row[IDX_PAGENR] and + abs(prev[IDX_X0] - row[IDX_X0]) < self.column_tolerance): # compare the y values if prev[IDX_Y0] - row[IDX_Y1] > self.linebreak_tolerance: sorted_rows_with_linebreaks.append((0, 0, 0, 0, 0, '')) # affiliation category titles elif (prev != '' and prev[IDX_ELEM].lower().startswith(Ana.MeetingAnalyzer.affiliation_categories)): sorted_rows_with_linebreaks.append((0, 0, 0, 0, 0, '')) # also mark new pages elif (prev != '' and prev[IDX_PAGENR] < row[IDX_PAGENR]): sorted_rows_with_linebreaks.append((0, 0, 0, 0, 0, new_page_marker)) # new column: only mark when a new affiliation follows (no continuation of the person) elif (prev != '' and abs(prev[IDX_X0] - row[IDX_X0]) - >= self.column_tolerance - and i + 1 < size - and row[IDX_Y0] - sorted_rows[i + 1][IDX_Y1] - > self.linebreak_tolerance): - sorted_rows_with_linebreaks.append((0, 0, 0, 0, 0, '')) + >= 
+                        >= self.column_tolerance):
+                    # case 1: only one line at the top of the new column -> new affiliation
+                    if (i + 1 < size
+                            and row[IDX_Y0] - sorted_rows[i + 1][IDX_Y1]
+                            > self.linebreak_tolerance):
+                        sorted_rows_with_linebreaks.append((0, 0, 0, 0, 0, ''))
+                    # case 2: the block of text matches a valid affiliation name
+                    block_of_text = row[IDX_ELEM]
+                    j = i
+                    while (j + 1 < size and
+                            sorted_rows[j][IDX_Y0] - sorted_rows[j + 1][IDX_Y1]
+                            < self.linebreak_tolerance):
+                        block_of_text += " " + sorted_rows[j + 1][IDX_ELEM]
+                        j += 1
+                    if block_of_text.lower() in valid_affiliation_names:
+                        sorted_rows_with_linebreaks.append((0, 0, 0, 0, 0, ''))
                 sorted_rows_with_linebreaks.append(row)
                 prev = row
 
         sorted_elems = [el[IDX_ELEM] for el in sorted_rows_with_linebreaks]
 
         # print the result to the txt file
         with open(self.output_file, "a", encoding="utf-8") as f:
             for row in sorted_elems:
                 f.write("%s\n" % row)
         # clean the category dictionary
         self.category_dict = {}
 
-
-    # my method
     def lineComesFirst(self, el1, el2):
         """
         returns True if el1 comes before el2 in the document
         """
         # compare pages
         if el1[IDX_PAGENR] == el2[IDX_PAGENR]:
             # if there is a category title on that page
             if el1[IDX_PAGENR] in self.category_dict:
                 # check if one element is a new category -> compare only y0
                 if ((el1[IDX_ELEM].lower()).startswith(Ana.MeetingAnalyzer.affiliation_categories)
                         or (el2[IDX_ELEM].lower()).startswith(Ana.MeetingAnalyzer.affiliation_categories)):
                     # if they're on the same line, sort the category title first
                     if abs(el1[IDX_Y0] - el2[IDX_Y0]) < self.sameline_tolerance:
                         return (el1[IDX_ELEM].lower()).startswith(Ana.MeetingAnalyzer.affiliation_categories)
                     else:
                         return el1[IDX_Y0] > el2[IDX_Y0]
                 # check if they're in different categories -> compare only y0
                 borders = list(self.category_dict[el1[IDX_PAGENR]])
                 borders.append(el1[IDX_Y0])
                 borders.append(el2[IDX_Y0])
                 sorted_borders_and_els = sorted(borders)
                 idx_diff = abs(sorted_borders_and_els.index(el1[IDX_Y0])
                                - sorted_borders_and_els.index(el2[IDX_Y0]))
                 if idx_diff > 1:
                     return el1[IDX_Y0] > el2[IDX_Y0]
             # otherwise, do the normal column check
             # check if they're in the same column (x0 similar)
             if abs(el1[IDX_X0] - el2[IDX_X0]) <= self.column_tolerance:
                 # same column -> y0 is decisive (grows from the bottom of the page)
                 return el1[IDX_Y0] > el2[IDX_Y0]
             else:
                 # different columns
                 return el1[IDX_X0] < el2[IDX_X0]
         else:
             return el1[IDX_PAGENR] < el2[IDX_PAGENR]
 
     def make_comparator(self, less_than):
         def compare(x, y):
             if less_than(x, y):
                 return -1
             elif less_than(y, x):
                 return 1
             else:
                 return 0
         return compare
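
Note: the new valid_affiliation_names lookup assumes a csv file with a single
"valid_affiliations" column whose entries are lowercase, since the extractor
compares block_of_text.lower() against the set. A minimal sketch of how such a
file could be produced (the two country names are illustrative placeholders,
not taken from the repository):

    import pandas as pd

    # hypothetical example entries; the real file lives at
    # ../data/dictionaries/valid_affiliation_names.csv
    names = pd.DataFrame({"valid_affiliations": ["france", "morocco"]})
    # utf-8-sig matches the encoding the extractor reads the file with
    names.to_csv("valid_affiliation_names.csv", index=False, encoding="utf-8-sig")
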
"sb48b": 2, "sb50": 2} # same for the end page default_endpage = 0 custom_default_endpage = {"cop1": 126} - def __init__(self, label, output_file): + def __init__(self, label, output_file, valid_affiliation_names_path=None): """ Constructor of this class Args: label (str): label of the meeting to process - output_file (string): name of the file to put the text in + output_file (str): name of the file to put the text in + valid_affiliation_names_path (str, optional): path to a csv file that contains + a dataframe with valid country names. Defaults to None. """ self.label = label # check if we have list for this label if not (os.path.isfile(DigitalPdfExtractor.getPDFpath(label)) or os.path.isfile(DigitalPdfExtractor.getPDFpath(label, 1))): raise ValueError("For this meeting, no PDF list is located in the data folder.") self.output_file = output_file + self.valid_affiliation_names_path = valid_affiliation_names_path def createPdfExtractor(self): """ returns the accurate PdfExtractor that is correctly initialized """ startpage = self.custom_default_startpage.get(self.label, self.default_startpage) if self.label in self.meetings_that_need_ocr: # Use OCR endpage = self.custom_default_endpage.get(self.label, self.default_endpage) return OcrExtractor(self.label, self.output_file, startpage, endpage) else: # Use PDF to txt if self.label in self.meetings_with_several_pdfs: # Pdf is splitted i = 1 path = DigitalPdfExtractor.getPDFpath(self.label, i) parts = 0 while os.path.isfile(path): parts += 1 i += 1 path = DigitalPdfExtractor.getPDFpath(self.label, i) return DigitalPdfExtractor( DigitalPdfExtractor.getPDFpath(self.label), self.output_file, startpage, - list_parts=parts) + list_parts=parts, + valid_affiliation_names_path=self.valid_affiliation_names_path) else: # cop5 has a special structure if self.label == "cop5": return DigitalPdfExtractor( DigitalPdfExtractor.getPDFpath(self.label), self.output_file, startpage, - column_tolerance=50) - extr.extract_text() + column_tolerance=50, + valid_affiliation_names_path=self.valid_affiliation_names_path) else: # normal case: just one pdf return DigitalPdfExtractor( DigitalPdfExtractor.getPDFpath(self.label), self.output_file, - startpage) - extr.extract_text() + startpage, + valid_affiliation_names_path=self.valid_affiliation_names_path) diff --git a/code/scripts/extract_participants.py b/code/scripts/extract_participants.py index 38624b9..ed43420 100644 --- a/code/scripts/extract_participants.py +++ b/code/scripts/extract_participants.py @@ -1,41 +1,42 @@ """ The main script of the cop participants extraction. Takes as an argument the number of the cop to process. 
""" import os import sys import partlistproc from partlistproc.MeetingAnalyzerFactory import MeetingAnalyzerFactory from partlistproc.PdfExtractorFactory import PdfExtractorFactory txt_prefix = "../results/participants-txt/" csv_prefix = "../results/participants-csv/" default_intermediate_name = txt_prefix + "raw_X.txt" default_output_name = csv_prefix + "participants_X.csv" +valid_affiliation_names_path = "../data/dictionaries/valid_affiliation_names.csv" # format: # extract_participants_xopX.py # # the last option is given if the OCR has already been done (for cop 1 - 4) # parse arguments arguments = sys.argv label = arguments[1] intermediate_name = default_intermediate_name.replace("X", label) output_name = default_output_name.replace("X", label) if(len(arguments) > 2): intermediate_name = txt_prefix + arguments[2] output_name = csv_prefix + arguments[3] # First, extract the text from the pdf if not already done if not os.path.isfile(intermediate_name): - extr_factory = PdfExtractorFactory(label, intermediate_name) + extr_factory = PdfExtractorFactory(label, intermediate_name, valid_affiliation_names_path) extr = extr_factory.createPdfExtractor() extr.extract_text() # Second, extract the data from the text ana_factory = MeetingAnalyzerFactory(label, intermediate_name) ana = ana_factory.get_analyzer() ana.get_data(output_name) diff --git a/code/scripts/plots/plot_participant_graph.py b/code/scripts/plots/plot_participant_graph.py index af7f395..50172c8 100644 --- a/code/scripts/plots/plot_participant_graph.py +++ b/code/scripts/plots/plot_participant_graph.py @@ -1,106 +1,108 @@ import pandas as pd import matplotlib.pyplot as plt import json import networkx as nx def find_largest_parties(): country_file = open("../data/dictionaries/valid_countries.txt", "r") countries = country_file.readlines() countries = [c.replace("\n", "") for c in countries] complete_data = pd.read_csv("../results/complete_dataset.csv", encoding="utf-8-sig") parties = complete_data.loc[complete_data["affiliation_category"] == "parties"] parties = parties.loc[parties["affiliation"].apply(lambda x: x in countries)] total_nb_participants_per_country = dict() grouped_parties = parties.groupby("affiliation") for aff, people in grouped_parties: total_nb_participants_per_country[aff] = len(people) sorted_c = sorted(total_nb_participants_per_country.items(), key=lambda x: x[1], reverse=True) print(sorted_c) return [x[0] for x in sorted_c] def plot(path): LABEL_IDX = 0 NAME_IDX = 1 AFFILIATION_IDX = 2 CATEGORY_IDX = 3 + considered_meetings = ["cop10", "sb22", "cop11", "sb24", "cop12", "sb26", "cop13", "sb28", "cop14", "sb30", "cop15", "sb32", + "cop16", "sb34", "cop17", "sb36", "cop18", "sb38", "cop19", "sb40", "cop20", "sb42", "cop21", "sb44", + "cop22", "sb46", "cop23", "sb48", "sb48b", "cop24", "sb50", "cop25"] f = open(path, "r", encoding="utf-8") text = f.read() names = json.loads(text) # exclude the names that have an error (two names in the same meeting) names = {n: l for n, l in names.items() if len(set([m[0] for m in l])) == len(l)} country_file = open("../data/dictionaries/valid_countries.txt", "r") countries = country_file.readlines() countries = [c.replace("\n", "") for c in countries] max_set_n = len(countries) biggest_countries = find_largest_parties()[:max_set_n] # biggest_countries.append("european union") G = nx.Graph() G.clear() affiliations = set(biggest_countries) # TODO could just do it for the 40 countries that have the most participants G.add_nodes_from(biggest_countries) # TODO maybe add NGO's for 
diff --git a/code/scripts/plots/plot_participant_graph.py b/code/scripts/plots/plot_participant_graph.py
index af7f395..50172c8 100644
--- a/code/scripts/plots/plot_participant_graph.py
+++ b/code/scripts/plots/plot_participant_graph.py
@@ -1,106 +1,108 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import json
 import networkx as nx
 
 
 def find_largest_parties():
     country_file = open("../data/dictionaries/valid_countries.txt", "r")
     countries = country_file.readlines()
     countries = [c.replace("\n", "") for c in countries]
     complete_data = pd.read_csv("../results/complete_dataset.csv", encoding="utf-8-sig")
     parties = complete_data.loc[complete_data["affiliation_category"] == "parties"]
     parties = parties.loc[parties["affiliation"].apply(lambda x: x in countries)]
     total_nb_participants_per_country = dict()
     grouped_parties = parties.groupby("affiliation")
     for aff, people in grouped_parties:
         total_nb_participants_per_country[aff] = len(people)
     sorted_c = sorted(total_nb_participants_per_country.items(), key=lambda x: x[1], reverse=True)
     print(sorted_c)
     return [x[0] for x in sorted_c]
 
 
 def plot(path):
     LABEL_IDX = 0
     NAME_IDX = 1
     AFFILIATION_IDX = 2
     CATEGORY_IDX = 3
+    considered_meetings = ["cop10", "sb22", "cop11", "sb24", "cop12", "sb26",
+                           "cop13", "sb28", "cop14", "sb30", "cop15", "sb32",
+                           "cop16", "sb34", "cop17", "sb36", "cop18", "sb38",
+                           "cop19", "sb40", "cop20", "sb42", "cop21", "sb44",
+                           "cop22", "sb46", "cop23", "sb48", "sb48b", "cop24",
+                           "sb50", "cop25"]
     f = open(path, "r", encoding="utf-8")
     text = f.read()
     names = json.loads(text)
     # exclude the names that have an error (the same name twice in the same meeting)
     names = {n: l for n, l in names.items() if len(set([m[0] for m in l])) == len(l)}
     country_file = open("../data/dictionaries/valid_countries.txt", "r")
     countries = country_file.readlines()
     countries = [c.replace("\n", "") for c in countries]
     max_set_n = len(countries)
     biggest_countries = find_largest_parties()[:max_set_n]
     # biggest_countries.append("european union")
     G = nx.Graph()
     G.clear()
     affiliations = set(biggest_countries)
     # TODO could just do it for the 40 countries that have the most participants
     G.add_nodes_from(biggest_countries)
     # TODO maybe add NGO's
     for name, list in names.items():
         previous_affiliation = ""
         current_affiliation = ""
         for participation in list:
-            if participation[AFFILIATION_IDX] in countries:
+            if participation[LABEL_IDX] in considered_meetings:
                 previous_affiliation = current_affiliation
                 current_affiliation = participation[AFFILIATION_IDX]
-                if current_affiliation not in affiliations and len(affiliations) < max_set_n:
-                    print(current_affiliation)
+                if current_affiliation not in affiliations:
                     G.add_node(current_affiliation)
                     affiliations.add(current_affiliation)
                 if previous_affiliation in affiliations and current_affiliation in affiliations and previous_affiliation != "" and previous_affiliation != current_affiliation:
                     if (previous_affiliation, current_affiliation) in G.edges:
                         # increase weight
                         if G[previous_affiliation][current_affiliation]["weight"] > 20:
                             print(name)
                             print(list)
                         G[previous_affiliation][current_affiliation]["weight"] += 1
                     else:
                         G.add_edge(previous_affiliation, current_affiliation, weight=1)
 
     # nodes
     print("Sorted nodes")
     highest_nodes = [x[0] for x in sorted(G.degree(weight='weight'), key=lambda x: x[1], reverse=True)]
     print(highest_nodes[:40])
     G = G.subgraph(highest_nodes[:40])
 
     # find the largest weight for the resizing of the edges
     sorted_edges = sorted(G.edges(data=True), key=lambda x: x[2]['weight'], reverse=True)
     print("biggest edges")
     print(sorted_edges[:20])
     max_weight = sorted_edges[0][2]["weight"]
     print(max_weight)
 
     pos = nx.circular_layout(G)
     nx.draw_networkx_nodes(
         G,
         pos,
         node_size=2,
     )
-    nx.draw_networkx_labels(G, pos, font_color="black", font_weight="bold")
+    nx.draw_networkx_labels(G, pos, font_color="black", font_size=8, font_weight="bold")
     for edge in G.edges(data='weight'):
         nx.draw_networkx_edges(G, pos, edgelist=[edge],
                                edge_color=(0/256.0, 162/256.0, 240/256.0, 1),
                                width=edge[2]/max_weight*10)
     """plt.subplot(122)
     nx.draw_shell(G, nlist=[range(5, 10), range(5)], with_labels=True, font_weight='bold')"""
     """print(G.nodes())
     nx.draw(G)"""
-    """plt.xlim(-0.05, 1.05)
-    plt.ylim(-0.05, 1.05)"""
+    plt.xlim(-1.25, 1.25)
+    plt.ylim(-1.1, 1.1)
     plt.axis("off")
     plt.show()
\ No newline at end of file
diff --git a/report/conclusion.tex b/report/conclusion.tex
index 2ab31aa..ed86ca8 100644
--- a/report/conclusion.tex
+++ b/report/conclusion.tex
@@ -1,5 +1,5 @@
 \section{Conclusion}
 
 \subsection{Criticism of the methodology}
 
-% mention errors of OCR, for example cop 7 (marocco in france)
\ No newline at end of file
+% mention errors of OCR, for example cop 7 (Morocco in France)
\ No newline at end of file
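
A closing note on the edge widths in plot_participant_graph.py above:
width=edge[2]/max_weight*10 normalizes by the heaviest edge, so that edge is
drawn at the maximum width of 10 and all others scale linearly below it. A
tiny worked example (the weights are invented):

    max_weight = 25                       # hypothetical heaviest co-occurrence count
    for weight in (25, 10, 1):
        print(weight / max_weight * 10)   # -> 10.0, 4.0, 0.4
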