pdf.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Tue, May 20, 13:42

pdf.py
View Options

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	""" This class can be used to represent a pdf.
	"""
	# Copyright (C) University of Basel 2019 {{{1
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}

	# Module metadata: authorship, contact, development status, license and version.
	__author__ = "Christian Steiner"
	__maintainer__ = __author__
	__copyright__ = 'University of Basel'
	__email__ = "christian.steiner@unibas.ch"
	__status__ = "Development"
	__license__ = "GPL v3"
	__version__ = "0.0.1"

	import lxml.etree as ET
	from pdfminer.pdfparser import PDFParser
	from pdfminer.pdfdocument import PDFDocument
	from pdfminer.pdfpage import PDFPage
	from pdfminer.pdfpage import PDFTextExtractionNotAllowed
	from pdfminer.pdfinterp import PDFResourceManager
	from pdfminer.pdfinterp import PDFPageInterpreter
	from pdfminer.pdfdevice import PDFDevice
	from pdfminer.layout import LAParams
	from pdfminer.converter import PDFPageAggregator
	import pdfminer
	from progress.bar import Bar
	import re
	import warnings

	from os import path
	from os.path import isfile, sep

	from .positional_object import PositionalObject
	from .matrix import Matrix

	class PDFText:
	    """Represent one page of a pdf and compare its text with transcribed words.

	    On construction the text objects of the requested page are extracted and
	    stored in ``self.text_tree``: an element tree with a ``pdf`` root and one
	    ``text`` child per text object, carrying the attributes ``id``, ``xmin``,
	    ``ymin``, ``xmax`` and ``ymax`` (bounding box rounded to three decimals)
	    and the object's text without newlines.

	    Args:
	        pdfFile (str): the pdf file name.
	        current_page_number (int): the index of the page to read (default 0).
	        sonderzeichen (list of str): extra separator strings that may appear
	            between two words that belong together. The instance always
	            allows the empty string and a blank, then every given string,
	            then every concatenation of two given strings.

	    Raises:
	        PDFTextExtractionNotAllowed: if the document forbids text extraction.
	        Exception: if the file does not contain the requested page.
	    """
	    UNITTESTING = False
	    # A word that consists of exactly one word character.
	    _SINGLE_CHAR_PATTERN = r'^\w$'

	    def __init__(self, pdfFile, current_page_number=0, sonderzeichen=None):
	        self.pdfFile = pdfFile
	        # None instead of a shared mutable default list.
	        extra_chars = list(sonderzeichen) if sonderzeichen else []
	        self.sonderzeichen = ['', ' '] + extra_chars\
	            + [a + b for a in extra_chars for b in extra_chars]
	        # The context manager closes the file on every exit path, including
	        # the exceptions raised below and parser errors.
	        with open(self.pdfFile, 'rb') as fp:
	            document = PDFDocument(PDFParser(fp))
	            if not document.is_extractable:
	                raise PDFTextExtractionNotAllowed
	            self.current_page_number = current_page_number
	            self.text_tree = ET.ElementTree(ET.Element('pdf'))
	            pages = list(PDFPage.create_pages(document))
	            if len(pages) <= self.current_page_number:
	                raise Exception('File {} does not contain page number {}'.format(self.pdfFile, self.current_page_number))
	            self.current_page = pages[self.current_page_number]
	            rsrcmgr = PDFResourceManager()
	            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
	            interpreter = PDFPageInterpreter(rsrcmgr, device)
	            interpreter.process_page(self.current_page)
	            root = self.text_tree.getroot()
	            for obj in device.get_result():
	                if isinstance(obj, pdfminer.layout.LTText):
	                    # Every child of root is a text node, so len(root) is the
	                    # next sequential id.
	                    text_node = ET.SubElement(root, 'text', attrib={
	                        'id': str(len(root)),
	                        'xmin': str(round(obj.bbox[0], 3)),
	                        'ymin': str(round(obj.bbox[1], 3)),
	                        'xmax': str(round(obj.bbox[2], 3)),
	                        'ymax': str(round(obj.bbox[3], 3))})
	                    text_node.text = obj.get_text().replace('\n', '')

	    @staticmethod
	    def _create_bar(message, count):
	        """Return a progress bar, or None while unittesting or for fewer than 10 items.
	        """
	        if PDFText.UNITTESTING or count < 10:
	            return None
	        return Bar(message, max=count)

	    @staticmethod
	    def _rotation_direction(word):
	        """Return the rotation direction of the first position of word.
	        """
	        transform = word.transkription_positions[0].transform
	        return Matrix.STRAIGHT if transform is None else transform.get_rotation_direction()

	    @staticmethod
	    def _ignores_line_number(word2join):
	        """Return whether join candidates must be searched regardless of line number.

	        This is the case if the word has line number -1 or if any of its
	        positions carries a transform.
	        """
	        return word2join.line_number == -1\
	            or any(position.transform is not None for position in word2join.transkription_positions)

	    @staticmethod
	    def _pdf_coordinates(word, transkription_field):
	        """Return (left, bottom) of the word's first position, shifted by the field origin if given.
	        """
	        position = word.transkription_positions[0]
	        if transkription_field is None:
	            return position.left, position.bottom
	        return position.left + transkription_field.xmin, position.bottom + transkription_field.ymin

	    def tree_contains_text_at(self, text, left, bottom):
	        """Returns whether tree contains the text at the specified position.

	        The position is shifted by a fixed offset and its vertical component
	        is mirrored using the fourth MediaBox value of the current page. A
	        text node matches if it contains text and its bounding box contains
	        the resulting point.
	        """
	        OFFSET = 3
	        x = left + OFFSET
	        y = self.current_page.attrs['MediaBox'][3] - bottom + OFFSET
	        needle = str(text)
	        # Nodes are inspected in Python rather than through a formatted XPath
	        # expression, which broke as soon as the text contained an apostrophe.
	        for node in self.text_tree.iter('text'):
	            if needle not in ''.join(node.itertext()):
	                continue
	            try:
	                xmin, xmax = float(node.get('xmin')), float(node.get('xmax'))
	                ymin, ymax = float(node.get('ymin')), float(node.get('ymax'))
	            except (TypeError, ValueError):
	                # A missing or non numeric bound never matches.
	                continue
	            if xmin <= x <= xmax and ymin <= y <= ymax:
	                return True
	        return False

	    def tree_contains_text(self, text):
	        """Returns whether any text node of the tree contains the text, regardless of position.
	        """
	        needle = str(text)
	        return any(needle in ''.join(node.itertext()) for node in self.text_tree.iter('text'))

	    def split_str_according_to_pdf_tree(self, text):
	        """Returns the string that has been found in the tree

	        Tried in this order: the text itself, the text without its first
	        char, without its last char, without both. An empty string is
	        returned if none of them is found.
	        """
	        for candidate in (text, text[1:], text[:-1], text[1:-1]):
	            if self.tree_contains_text(candidate):
	                return candidate
	        return ''

	    def split_wrongly_concatenated_words(self, page):
	        """Test for falsely concatenated words and split them

	        A word whose text is not on the pdf is split at the longest prefix
	        for which split_str_according_to_pdf_tree finds a match. A word for
	        which no prefix matches is reported by a warning and is not part of
	        the result.

	        [:returns:] an updated Array of all (datatypes.word) Words
	        """
	        new_words = []
	        for word in page.words:
	            if self.tree_contains_text(word.text):
	                new_words.append(word)
	                continue
	            for index in range(len(word.text), 0, -1):
	                result = self.split_str_according_to_pdf_tree(word.text[:index])
	                if len(result) > 0:
	                    previousWord, currentWord, nextWord = word.split(result, start_id=len(page.words))
	                    if previousWord is not None:
	                        new_words.append(previousWord)
	                    new_words.append(currentWord)
	                    if nextWord is not None:
	                        new_words.append(nextWord)
	                    break
	            else:
	                # No prefix of the word has been found on the pdf.
	                warnings.warn('ATTENTION: Word not found: {} on line {}: {}!'.format(word.id, word.line_number, word.text))
	        return new_words

	    def get_previous_word2join(self, word2join, page, transkription_field=None):
	        """Finds previous word to word2join and returns it after testing if joined word is on pdf.

	        [:return:] the closest word left of word2join for which the joined
	        text (with any of self.sonderzeichen in between) is on the pdf at
	        the position of that word, or None.
	        """
	        THRESHOLD = 1.5
	        LEFTDIFF = 100
	        target = word2join.transkription_positions[0]

	        def is_left_and_level(word):
	            last = word.transkription_positions[-1]
	            return last.left < target.left and abs(last.bottom - target.bottom) < THRESHOLD

	        if self._ignores_line_number(word2join):
	            candidates = [word for word in page.words
	                          if is_left_and_level(word)
	                          and abs(word.transkription_positions[-1].left - target.left) < LEFTDIFF]
	        else:
	            candidates = [word for word in page.words
	                          if word.line_number == word2join.line_number and is_left_and_level(word)]
	        # Reverse sorted: the first item is the nearest word on the left.
	        candidates.sort(key=lambda word: word.transkription_positions[0].left, reverse=True)
	        for candidate in candidates:
	            left, bottom = self._pdf_coordinates(candidate, transkription_field)
	            if any(self.tree_contains_text_at(candidate.text + separator + word2join.text, left, bottom)
	                   for separator in self.sonderzeichen):
	                return candidate
	        return None

	    def get_next_word2join(self, word2join, page, transkription_field=None):
	        """Finds next word to join word2join and returns it after testing if joined word is on pdf.

	        [:return:] the closest word right of word2join for which the joined
	        text (with any of self.sonderzeichen in between) is on the pdf at
	        the position of that word, or None.
	        """
	        THRESHOLD = 1.5
	        LEFTDIFF = 100
	        first = word2join.transkription_positions[0]

	        def is_level(word):
	            return abs(word.transkription_positions[0].bottom - first.bottom) < THRESHOLD

	        if self._ignores_line_number(word2join):
	            # Measured from the last position of word2join in this case.
	            last_left = word2join.transkription_positions[-1].left
	            candidates = [word for word in page.words
	                          if is_level(word)
	                          and abs(word.transkription_positions[0].left - last_left) < LEFTDIFF
	                          and word.transkription_positions[0].left > last_left]
	        else:
	            candidates = [word for word in page.words
	                          if word.line_number == word2join.line_number
	                          and word.transkription_positions[0].left > first.left
	                          and is_level(word)]
	        # Sorted: the first item is the nearest word on the right.
	        candidates.sort(key=lambda word: word.transkription_positions[0].left)
	        for candidate in candidates:
	            left, bottom = self._pdf_coordinates(candidate, transkription_field)
	            if any(self.tree_contains_text_at(word2join.text + separator + candidate.text, left, bottom)
	                   for separator in self.sonderzeichen):
	                return candidate
	        return None

	    def add_punctuation2words(self, page, transkription_field=None):
	        """Join words that consist of punctuation only to words.

	        Each such word is joined to its previous word (if one is found) and
	        removed from page.words.
	        """
	        punctuation_pattern = r'^[.,:;?]$'
	        punctuation_words = [word for word in page.words if re.match(punctuation_pattern, word.text)]
	        bar = self._create_bar('Joining punctuations with words', len(punctuation_words))
	        for punctuation_word in punctuation_words:
	            if bar is not None:
	                bar.next()
	            previousWord = self.get_previous_word2join(punctuation_word, page, transkription_field=transkription_field)
	            if previousWord is not None:
	                previousWord.join(punctuation_word)
	                page.words.remove(punctuation_word)
	        if bar is not None:
	            bar.finish()

	    def join_composita(self, page, transkription_field=None):
	        """Joins composita.

	        First, words that consist of a single '-' or '=' are joined with
	        their previous and/or next word. Second, words that start with such
	        a sign followed by an upper case letter are joined to their previous
	        word. Joined words are removed from page.words.
	        """
	        connection_words = [word for word in page.words if re.match(r'^[-=]$', word.text)]
	        bar = self._create_bar('Joining composita', len(connection_words))
	        for connection_word in connection_words:
	            if bar is not None:
	                bar.next()
	            if connection_word not in page.words:
	                # Already merged into another word by an earlier iteration;
	                # removing it a second time would raise a ValueError.
	                continue
	            previousWord = self.get_previous_word2join(connection_word, page, transkription_field=transkription_field)
	            nextWord = self.get_next_word2join(connection_word, page, transkription_field=transkription_field)
	            if previousWord is not None:
	                previousWord.join(connection_word)
	                page.words.remove(connection_word)
	                if nextWord is not None:
	                    previousWord.join(nextWord)
	                    page.words.remove(nextWord)
	            elif nextWord is not None:
	                connection_word.join(nextWord)
	                page.words.remove(nextWord)
	        composita_pattern = r'^[=-]\s*[A-Z]'
	        for composita_word in [word for word in page.words if re.match(composita_pattern, word.text)]:
	            previousWord = self.get_previous_word2join(composita_word, page, transkription_field=transkription_field)
	            if previousWord is not None:
	                previousWord.join(composita_word)
	                page.words.remove(composita_word)
	        if bar is not None:
	            bar.finish()

	    def find_word_path(self, words_on_current_line, path=None):
	        """Finds the words that form a path above or beneath words on the same uneven line.

	        Starting at the first single char word, neighbours to the left and
	        to the right are collected as long as they are on the same level (or
	        follow the rotation direction) and the concatenated text is on the
	        pdf. The search is then repeated for the rest of the line.

	        [:return:] a list of word that belong to this path in the proper order.
	        """
	        # None instead of a mutable default that would be handed out to callers.
	        path = [] if path is None else path
	        if len(words_on_current_line) < 2:
	            return path
	        THRESHOLD = 1.5
	        words_on_path = []
	        words_on_current_line = sorted(words_on_current_line, key=lambda word: word.transkription_positions[0].left)
	        first_single_char_index = next((position for position, word in enumerate(words_on_current_line)
	                                        if re.match(self._SINGLE_CHAR_PATTERN, word.text)), None)
	        if first_single_char_index is None:
	            # Without a single char word there is no start for a path.
	            return path
	        current_word = words_on_current_line[first_single_char_index]
	        transform_direction = self._rotation_direction(current_word)
	        # look left
	        index = 1
	        start_found = False
	        current_text = current_word.text
	        while first_single_char_index-index >= 0 and not start_found:
	            left_word = words_on_current_line[first_single_char_index-index]
	            left_bottom = left_word.transkription_positions[-1].bottom
	            current_bottom = current_word.transkription_positions[0].bottom
	            if abs(left_bottom-current_bottom) < THRESHOLD\
	               or (transform_direction*-1 == Matrix.DOWN and left_bottom < current_bottom)\
	               or (transform_direction*-1 == Matrix.UP and left_bottom > current_bottom):
	                if self.tree_contains_text(left_word.text + current_text):
	                    current_text = left_word.text + current_text
	                    words_on_path.insert(0, left_word)
	                elif self.tree_contains_text(left_word.text + ' ' + current_text):
	                    current_text = left_word.text + ' ' + current_text
	                    words_on_path.insert(0, left_word)
	                else:
	                    start_found = True
	                current_word = left_word
	                transform_direction = self._rotation_direction(current_word)
	            else:
	                start_found = True
	            index += 1
	        # restart at the single char word before looking in the other direction
	        current_word = words_on_current_line[first_single_char_index]
	        transform_direction = self._rotation_direction(current_word)
	        words_on_path.append(current_word)
	        # look right
	        index = 1
	        end_found = False
	        while first_single_char_index+index < len(words_on_current_line) and not end_found:
	            right_word = words_on_current_line[first_single_char_index+index]
	            if abs(right_word.transkription_positions[-1].bottom-current_word.transkription_positions[0].bottom) < THRESHOLD\
	               or (transform_direction == Matrix.DOWN\
	                   and right_word.transkription_positions[0].bottom < current_word.transkription_positions[-1].bottom)\
	               or (transform_direction == Matrix.UP\
	                   and right_word.transkription_positions[0].bottom > current_word.transkription_positions[-1].bottom):
	                if self.tree_contains_text(current_text + right_word.text):
	                    current_text = current_text + right_word.text
	                    words_on_path.append(right_word)
	                elif self.tree_contains_text(current_text + ' ' + right_word.text):
	                    current_text = current_text + ' ' + right_word.text
	                    words_on_path.append(right_word)
	                else:
	                    end_found = True
	                current_word = right_word
	                transform_direction = self._rotation_direction(current_word)
	            else:
	                end_found = True
	            index += 1
	        path = path + words_on_path
	        # continue with the last word that has been examined on the right
	        first_single_char_index += index - 1
	        remaining_words = words_on_current_line[first_single_char_index:]
	        if first_single_char_index < len(words_on_current_line)\
	           and any(re.match(self._SINGLE_CHAR_PATTERN, word.text) for word in remaining_words):
	            return self.find_word_path(remaining_words, path=path)
	        return path

	    def join_single_char_words(self, page, transkription_field=None):
	        """Joins words that consist of single chars if joined words are on pdf.

	        While this method runs, a blank is not accepted as separator between
	        two words; self.sonderzeichen is restored afterwards, so the method
	        can be called repeatedly.
	        """
	        all_separators = self.sonderzeichen
	        separators_without_blank = list(all_separators)
	        if ' ' in separators_without_blank:
	            separators_without_blank.remove(' ')
	        self.sonderzeichen = separators_without_blank
	        try:
	            self._join_single_char_words(page, transkription_field)
	        finally:
	            self.sonderzeichen = all_separators

	    def _join_single_char_words(self, page, transkription_field):
	        """Do the joining for join_single_char_words with the current self.sonderzeichen.
	        """
	        single_char_words = [word for word in page.words if re.match(self._SINGLE_CHAR_PATTERN, word.text)]
	        # first check for word path going above words on the same uneven line
	        uneven_line_numbers = sorted(set(word.line_number for word in single_char_words
	                                         if word.line_number % 2 == 1 and word.line_number > 0))
	        for line_number in uneven_line_numbers:
	            words_on_current_line = [word for word in page.words if word.line_number == line_number]
	            if any(PositionalObject.POSITIONS_ARE_STACKED(a.transkription_positions[0], b.transkription_positions[0])
	                   for a in words_on_current_line
	                   for b in words_on_current_line
	                   if a != b):
	                previousWord = None
	                for word in self.find_word_path(words_on_current_line):
	                    if previousWord is not None\
	                       and PositionalObject.POSITIONS_OVERLAP_HORIZONTALLY(
	                               previousWord.transkription_positions[-1], word.transkription_positions[0]):
	                        previousWord.join(word)
	                        page.words.remove(word)
	                    else:
	                        previousWord = word
	        ###TODO: this works only if we get the right spacing for each individual letter, look it up in svg path file
	        bar = self._create_bar('Joining single char words', len(single_char_words))
	        for single_char_word in single_char_words:
	            if bar is not None:
	                bar.next()
	            if single_char_word not in page.words:
	                # already joined to another word
	                continue
	            currentWord = single_char_word
	            previousWord = self.get_previous_word2join(currentWord, page, transkription_field=transkription_field)
	            if previousWord is not None:
	                previousWord.join(currentWord)
	                page.words.remove(currentWord)
	                currentWord = previousWord
	            nextWord = self.get_next_word2join(currentWord, page, transkription_field=transkription_field)
	            while nextWord is not None:
	                currentWord.join(nextWord)
	                page.words.remove(nextWord)
	                nextWord = self.get_next_word2join(currentWord, page, transkription_field=transkription_field)
	        if bar is not None:
	            bar.finish()

	    def compare_svgWords2pdfWords(self, page, transkription_field=None, split_wrongly_concatenated_words=False):
	        """Compares each word to the word of the pdf and splits or joins them.

	        Optionally splits wrongly concatenated words first, then joins
	        punctuation, composita and single char words (in this order).
	        page.words is updated in place.
	        """
	        if split_wrongly_concatenated_words:
	            page.words = self.split_wrongly_concatenated_words(page)
	        self.add_punctuation2words(page, transkription_field=transkription_field)
	        self.join_composita(page, transkription_field=transkription_field)
	        self.join_single_char_words(page, transkription_field=transkription_field)

pdf.pyNo OneTemporaryActions

File Metadata

pdf.pyView Options

Event Timeline

pdf.py
No OneTemporary
Actions

pdf.py
View Options