Page MenuHomec4science

PdfExtractorFactory.py
No OneTemporary

File Metadata

Created
Sun, Jul 28, 01:25

PdfExtractorFactory.py

import os
from pathlib import Path
from partlistproc.DigitalPdfExtractor import DigitalPdfExtractor
from partlistproc.OcrExtractor import OcrExtractor
class PdfExtractorFactory():
    """Choose and configure the extractor to use for one meeting's PDF list.

    The choice is driven by the meeting label (e.g. ``"cop5"``, ``"sb12"``)
    and the class-level lookup tables below:

    * labels in ``meetings_that_need_ocr`` get an ``OcrExtractor``;
    * labels in ``meetings_with_several_pdfs`` get a ``DigitalPdfExtractor``
      that is told how many part files were found on disk;
    * ``"cop5"`` gets a ``DigitalPdfExtractor`` with a custom
      ``column_tolerance``;
    * every other label gets a plain ``DigitalPdfExtractor``.
    """

    # Labels handled by OcrExtractor instead of DigitalPdfExtractor.
    meetings_that_need_ocr = ["cop1", "cop2", "cop3", "cop4", "cop7", "cop8",
                              "sb1", "sb2", "sb4", "sb5", "sb6", "sb7", "sb10",
                              "sb12", "sb13"]

    # Labels whose list is split over several numbered PDF files.
    meetings_with_several_pdfs = ["cop11", "cop13", "cop14", "cop15", "cop16",
                                  "cop17", "cop21", "cop22"]

    meetings_with_corrigendum = []  # TODO

    # Normally, the first page of the list is this one.
    default_startpage = 3

    # For the meetings that differ, the start page is looked up here.
    custom_default_startpage = {"cop2": 2, "cop5": 2, "cop23": 2, "cop24": 2,
                                "cop25": 2, "sb1": 2, "sb2": 2, "sb4": 2,
                                "sb5": 2, "sb7": 2, "sb46": 2, "sb48": 2,
                                "sb50": 2}

    # Same for the end page (only used on the OCR path).
    # NOTE(review): 0 presumably means "until the last page" - confirm
    # against OcrExtractor.
    default_endpage = 0
    custom_default_endpage = {"cop1": 126}

    def __init__(self, label, output_file):
        """Store the meeting label and output file after checking the PDF.

        Args:
            label (str): label of the meeting to process
            output_file (str): name of the file to put the text in

        Raises:
            ValueError: if neither the path returned by
                ``DigitalPdfExtractor.getPDFpath(label)`` nor the one
                returned by ``DigitalPdfExtractor.getPDFpath(label, 1)``
                is an existing file.
        """
        self.label = label

        # A list is available either as a single PDF or as numbered parts
        # (part 1 being the first); accept either layout.
        has_single_pdf = os.path.isfile(DigitalPdfExtractor.getPDFpath(label))
        if not (has_single_pdf
                or os.path.isfile(DigitalPdfExtractor.getPDFpath(label, 1))):
            raise ValueError("For this meeting, no PDF list is located in the data folder.")

        self.output_file = output_file

    def createPdfExtractor(self):
        """Return the extractor matching ``self.label``, fully initialised.

        The extractor is only constructed here; running the extraction is
        left to the caller.

        Returns:
            An ``OcrExtractor`` for labels in ``meetings_that_need_ocr``,
            otherwise a ``DigitalPdfExtractor``.
        """
        startpage = self.custom_default_startpage.get(self.label,
                                                      self.default_startpage)

        if self.label in self.meetings_that_need_ocr:
            # Use OCR; this is the only path that needs an end page.
            endpage = self.custom_default_endpage.get(self.label,
                                                      self.default_endpage)
            return OcrExtractor(self.label, self.output_file, startpage,
                                endpage)

        # From here on: use PDF to txt.
        if self.label in self.meetings_with_several_pdfs:
            # The PDF is split: count consecutive part files 1, 2, 3, ...
            # and stop at the first number that has no file.
            parts = 0
            while os.path.isfile(
                    DigitalPdfExtractor.getPDFpath(self.label, parts + 1)):
                parts += 1
            return DigitalPdfExtractor(
                DigitalPdfExtractor.getPDFpath(self.label),
                self.output_file,
                startpage,
                list_parts=parts)

        if self.label == "cop5":
            # cop5 has a special structure
            return DigitalPdfExtractor(
                DigitalPdfExtractor.getPDFpath(self.label),
                self.output_file,
                startpage,
                column_tolerance=50)

        # normal case: just one pdf
        return DigitalPdfExtractor(
            DigitalPdfExtractor.getPDFpath(self.label),
            self.output_file,
            startpage)

Event Timeline