File Metadata

Created: Wed, Jul 2, 20:11

PdfExtractorFactory.py
View Options

	import os
	from pathlib import Path

	from partlistproc.DigitalPdfExtractor import DigitalPdfExtractor
	from partlistproc.OcrExtractor import OcrExtractor


	class PdfExtractorFactory():
	    """Choose and configure the extractor to use for a meeting's PDF list.

	    Meetings whose label appears in ``meetings_that_need_ocr`` get an
	    ``OcrExtractor``; every other meeting gets a ``DigitalPdfExtractor``,
	    configured according to whether the list is split over several PDF
	    files or needs special handling (``cop5``).
	    """

	    # labels of the meetings that are processed with OCR
	    meetings_that_need_ocr = ["cop1", "cop2", "cop3", "cop4", "cop7", "cop8",
	                              "sb1", "sb2", "sb4", "sb5", "sb6", "sb7", "sb10",
	                              "sb12", "sb13"]
	    # labels of the meetings whose list is split over several numbered PDFs
	    meetings_with_several_pdfs = ["cop11", "cop13", "cop14", "cop15", "cop16",
	                                  "cop17", "cop21", "cop22"]
	    meetings_with_corrigendum = []  # TODO

	    # normally, the first page of the list is this
	    default_startpage = 3
	    # for the meetings that are different, the start page is in this dict
	    custom_default_startpage = {"cop2": 2, "cop5": 2, "cop23": 2, "cop24": 2,
	                                "cop25": 2, "sb1": 2, "sb2": 2, "sb4": 2,
	                                "sb5": 2, "sb7": 2, "sb46": 2, "sb48": 2,
	                                "sb50": 2}

	    # same for the end page; it is only handed to the OCR extractor
	    # NOTE(review): 0 presumably means "no explicit end page" -- confirm
	    # against OcrExtractor
	    default_endpage = 0
	    custom_default_endpage = {"cop1": 126}

	    def __init__(self, label, output_file):
	        """ Constructor of this class

	        Args:
	            label (str): label of the meeting to process
	            output_file (str): name of the file to put the text in

	        Raises:
	            ValueError: if neither the single PDF of the meeting nor the
	                first part of a split PDF exists on disk
	        """
	        self.label = label
	        # check if we have a list for this label: either one single PDF or,
	        # for split lists, at least the part numbered 1
	        if not (os.path.isfile(DigitalPdfExtractor.getPDFpath(label)) or
	                os.path.isfile(DigitalPdfExtractor.getPDFpath(label, 1))):
	            raise ValueError("For this meeting, no PDF list is located in the data folder.")
	        self.output_file = output_file

	    def _count_pdf_parts(self):
	        """ counts the consecutive part files (1, 2, ...) that exist on disk

	        Returns:
	            int: number of existing parts; counting stops at the first
	                part number for which no file is found
	        """
	        parts = 0
	        while os.path.isfile(
	                DigitalPdfExtractor.getPDFpath(self.label, parts + 1)):
	            parts += 1
	        return parts

	    def createPdfExtractor(self):
	        """ returns the accurate PdfExtractor that is correctly initialized

	        The extractor is only constructed here; starting the extraction is
	        left to the caller.

	        Returns:
	            OcrExtractor for meetings listed in ``meetings_that_need_ocr``,
	            otherwise a DigitalPdfExtractor
	        """
	        startpage = self.custom_default_startpage.get(self.label,
	                                                      self.default_startpage)

	        if self.label in self.meetings_that_need_ocr:
	            # Use OCR
	            endpage = self.custom_default_endpage.get(self.label,
	                                                      self.default_endpage)
	            return OcrExtractor(self.label, self.output_file, startpage,
	                                endpage)

	        # Use PDF to txt
	        if self.label in self.meetings_with_several_pdfs:
	            # Pdf is split: tell the extractor how many parts there are
	            parts = self._count_pdf_parts()
	            return DigitalPdfExtractor(
	                DigitalPdfExtractor.getPDFpath(self.label),
	                self.output_file,
	                startpage,
	                list_parts=parts)

	        if self.label == "cop5":
	            # cop5 has a special structure
	            # NOTE(review): the meaning and unit of column_tolerance are
	            # defined by DigitalPdfExtractor -- confirm there
	            return DigitalPdfExtractor(
	                DigitalPdfExtractor.getPDFpath(self.label),
	                self.output_file,
	                startpage,
	                column_tolerance=50)

	        # normal case: just one pdf
	        return DigitalPdfExtractor(
	            DigitalPdfExtractor.getPDFpath(self.label),
	            self.output_file,
	            startpage)

PdfExtractorFactory.py
No OneTemporary
Actions

File Metadata

PdfExtractorFactory.py
View Options

Event Timeline

PdfExtractorFactory.pyNo OneTemporaryActions

File Metadata

PdfExtractorFactory.pyView Options

Event Timeline

PdfExtractorFactory.py
No OneTemporary
Actions

PdfExtractorFactory.py
View Options