Page MenuHomec4science

pdf.py
No OneTemporary

File Metadata

Created
Fri, May 17, 23:46
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a pdf.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata: authorship, contact, development status, license and version.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import lxml.etree as ET
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
from progress.bar import Bar
import re
import warnings
from os import path
from os.path import isfile, sep
from .positional_object import PositionalObject
from .matrix import Matrix
class PDFText:
    """This class represents one page of a pdf and extracts the text from it.

    The text objects that pdfminer finds on the page are stored as <text> nodes of an
    xml tree (self.text_tree). Each node carries its id and its bounding box
    (xmin, ymin, xmax, ymax, rounded to 3 decimals) as attributes and the text without
    newlines as content. All other methods query this tree in order to split or join
    the words of a page object.

    Args:
        pdfFile (str): the pdf file name.
        current_page_number (int): the index of the page that should be read
            (index into the list of pages of the pdf, default 0).
        sonderzeichen (list): strings ("special characters") that may occur between two
            words that are joined. The empty string, a blank and all pairwise
            concatenations of the passed strings are always added.

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
        Exception: if the pdf does not contain the requested page number.
    """
    UNITTESTING = False

    def __init__(self, pdfFile, current_page_number=0, sonderzeichen=None):
        self.pdfFile = pdfFile
        # None instead of a mutable default; an empty list still yields [ '', ' ' ].
        extra_chars = list(sonderzeichen) if sonderzeichen else []
        self.sonderzeichen = [ '', ' ' ] + extra_chars + [ a + b for a in extra_chars for b in extra_chars ]
        self.current_page_number = current_page_number
        self.text_tree = ET.ElementTree(ET.Element('pdf'))
        # The context manager closes the file on every exit path, including the raises below.
        with open(self.pdfFile, 'rb') as fp:
            document = PDFDocument(PDFParser(fp))
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            pages = list(PDFPage.create_pages(document))
            if len(pages) <= self.current_page_number:
                raise Exception('File {} does not contain page number {}'.format(self.pdfFile, self.current_page_number))
            self.current_page = pages[self.current_page_number]
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(self.current_page)
            layout = device.get_result()
            # The tree starts empty, hence a running counter equals the number of
            # <text> nodes added so far (no need to query the tree for each node).
            text_id = 0
            for obj in layout:
                if isinstance(obj, pdfminer.layout.LTText):
                    text_node = ET.SubElement(self.text_tree.getroot(), 'text',
                            attrib={'id': str(text_id),
                                'xmin': str(round(obj.bbox[0], 3)), 'ymin': str(round(obj.bbox[1], 3)),
                                'xmax': str(round(obj.bbox[2], 3)), 'ymax': str(round(obj.bbox[3], 3))})
                    text_node.text = obj.get_text().replace('\n', '')
                    text_id += 1

    @staticmethod
    def _progress_bar(message, count):
        """Returns a progress bar, or None if unittesting or if there are less than 10 items.
        """
        if PDFText.UNITTESTING or count < 10:
            return None
        return Bar(message, max=count)

    @staticmethod
    def _rotation_direction(word):
        """Returns the rotation direction of the first position of word (Matrix.STRAIGHT if it has no transform).
        """
        transform = word.transkription_positions[0].transform
        return Matrix.STRAIGHT if transform is None else transform.get_rotation_direction()

    def _find_word_on_pdf(self, candidates, build_text, transkription_field=None):
        """Returns the first candidate for which a joined text is found on the pdf at the candidate's position.

        Args:
            candidates (list): words in the order in which they should be tested.
            build_text (callable): takes (candidate, sonderzeichen) and returns the joined text to look for.
            transkription_field: if not None, its xmin/ymin are added to the candidate's left/bottom.

        [:returns:] the matching word or None
        """
        for candidate in candidates:
            left = candidate.transkription_positions[0].left
            bottom = candidate.transkription_positions[0].bottom
            if transkription_field is not None:
                left = left + transkription_field.xmin
                bottom = bottom + transkription_field.ymin
            if any(self.tree_contains_text_at(build_text(candidate, sonderzeichen), left, bottom)
                    for sonderzeichen in self.sonderzeichen):
                return candidate
        return None

    def tree_contains_text_at(self, text, left, bottom):
        """Returns whether tree contains the text at the specified position.

        A <text> node matches if it contains text and if its bounding box encloses
        the point (left + 3, MediaBox[3] - bottom + 3).
        """
        OFFSET = 3
        x = left + OFFSET
        # NOTE(review): presumably converts a top-based coordinate into the bottom-based
        # coordinate system of the pdf by means of the page height -- confirm.
        y = self.current_page.attrs['MediaBox'][3] - bottom + OFFSET
        # XPath variables instead of string formatting: text may contain quotes.
        matches = self.text_tree.xpath(
                ".//text[contains(., $needle) and @xmin<=$x and @xmax>=$x and @ymin<=$y and @ymax>=$y]",
                needle=text, x=x, y=y)
        return len(matches) > 0

    def tree_contains_text(self, text):
        """Returns whether tree contains the text anywhere on the page.
        """
        # XPath variable instead of string formatting: text may contain quotes.
        return len(self.text_tree.xpath(".//text[contains(., $needle)]", needle=text)) > 0

    def split_str_according_to_pdf_tree(self, text):
        """Returns the string that has been found in the tree

        Tries, in this order: the text itself, the text without its first char, the text
        without its last char, the text without first and last char.

        [:returns:] the first variant that is found in the tree or '' if none is found
        """
        for variant in (text, text[1:], text[:-1], text[1:-1]):
            if self.tree_contains_text(variant):
                return variant
        return ''

    def split_wrongly_concatenated_words(self, page):
        """Test for falsely concatenated words and split them

        [:returns:] an updated Array of all (datatypes.word) Words
        """
        new_words = []
        for word in page.words:
            if self.tree_contains_text(word.text):
                new_words.append(word)
                continue
            # Shorten the word from the right until a part of it is found on the pdf.
            for end in range(len(word.text), 0, -1):
                result = self.split_str_according_to_pdf_tree(word.text[:end])
                if len(result) > 0:
                    previousWord, currentWord, nextWord = word.split(result, start_id=len(page.words))
                    if previousWord is not None:
                        new_words.append(previousWord)
                    new_words.append(currentWord)
                    if nextWord is not None:
                        new_words.append(nextWord)
                    break
            else:
                # NOTE(review): a word that is not found is not part of the returned list.
                warnings.warn('ATTENTION: Word not found: {} on line {}: {}!'.format(word.id, word.line_number, word.text))
        return new_words

    def get_previous_word2join(self, word2join, page, transkription_field=None):
        """Finds previous word to word2join and returns it after testing if joined word is on pdf.

        Candidates are the words whose last position is left of the first position of
        word2join and on (almost) the same height. They have to be on the same line,
        unless word2join has line number -1 or a transformed position: in that case
        they have to be less than LEFTDIFF away instead.

        [:returns:] the previous word or None
        """
        THRESHOLD = 1.5
        LEFTDIFF = 100
        ignore_line_number = word2join.line_number == -1\
                or any(position.transform is not None for position in word2join.transkription_positions)

        def is_candidate(word):
            if not ignore_line_number and word.line_number != word2join.line_number:
                return False
            last_position = word.transkription_positions[-1]
            join_position = word2join.transkription_positions[0]
            return abs(last_position.bottom - join_position.bottom) < THRESHOLD\
                    and last_position.left < join_position.left\
                    and (not ignore_line_number or abs(last_position.left - join_position.left) < LEFTDIFF)

        # a reverse sorted list of words that are left to word2join -> first item should be word to join.
        previous_word_list = sorted([ word for word in page.words if is_candidate(word) ],
                key=lambda word: word.transkription_positions[0].left, reverse=True)
        return self._find_word_on_pdf(previous_word_list,
                lambda word, sonderzeichen: word.text + sonderzeichen + word2join.text,
                transkription_field=transkription_field)

    def get_next_word2join(self, word2join, page, transkription_field=None):
        """Finds next word to join word2join and returns it after testing if joined word is on pdf.

        Candidates are the words whose first position is right of word2join and on
        (almost) the same height. They have to be on the same line, unless word2join has
        line number -1 or a transformed position: in that case they are compared with the
        last position of word2join and have to be less than LEFTDIFF away instead.

        [:returns:] the next word or None
        """
        THRESHOLD = 1.5
        LEFTDIFF = 100
        ignore_line_number = word2join.line_number == -1\
                or any(position.transform is not None for position in word2join.transkription_positions)

        def is_candidate(word):
            if not ignore_line_number and word.line_number != word2join.line_number:
                return False
            first_position = word.transkription_positions[0]
            if abs(first_position.bottom - word2join.transkription_positions[0].bottom) >= THRESHOLD:
                return False
            if ignore_line_number:
                join_left = word2join.transkription_positions[-1].left
                return abs(first_position.left - join_left) < LEFTDIFF and first_position.left > join_left
            return first_position.left > word2join.transkription_positions[0].left

        # a sorted list of words that are right to word2join -> first item should be word to join.
        next_word_list = sorted([ word for word in page.words if is_candidate(word) ],
                key=lambda word: word.transkription_positions[0].left)
        return self._find_word_on_pdf(next_word_list,
                lambda word, sonderzeichen: word2join.text + sonderzeichen + word.text,
                transkription_field=transkription_field)

    def add_punctuation2words(self, page, transkription_field=None):
        """Join words that consist of punctuation only to words.

        Each word that consists of one of the chars .,:;? is joined to its previous word
        (if there is one on the pdf) and removed from page.words.
        """
        punctuation_pattern = r'^[.,:;?]$'
        punctuation_words = [ word for word in page.words if re.match(punctuation_pattern, word.text) ]
        bar = self._progress_bar('Joining punctuations with words', len(punctuation_words))
        for punctuation_word in punctuation_words:
            if bar is not None:
                bar.next()
            previousWord = self.get_previous_word2join(punctuation_word, page, transkription_field=transkription_field)
            if previousWord is not None:
                previousWord.join(punctuation_word)
                page.words.remove(punctuation_word)
        if bar is not None:
            bar.finish()

    def join_composita(self, page, transkription_field=None):
        """Joins composita.

        First, words that consist of a single '-' or '=' are joined with their previous
        and/or next word. Then, words that start with '-' or '=' followed by an uppercase
        letter are joined to their previous word. Joined words are removed from page.words.
        """
        connection_words = [ word for word in page.words if re.match(r'^[-=]$', word.text) ]
        bar = self._progress_bar('Joining composita', len(connection_words))
        for connection_word in connection_words:
            if bar is not None:
                bar.next()
            # A connection word may already have been joined to (and removed by) an earlier one.
            if connection_word not in page.words:
                continue
            previousWord = self.get_previous_word2join(connection_word, page, transkription_field=transkription_field)
            nextWord = self.get_next_word2join(connection_word, page, transkription_field=transkription_field)
            if previousWord is not None:
                previousWord.join(connection_word)
                page.words.remove(connection_word)
                if nextWord is not None:
                    previousWord.join(nextWord)
                    page.words.remove(nextWord)
            elif nextWord is not None:
                connection_word.join(nextWord)
                page.words.remove(nextWord)
        composita_pattern = r'^[=-]\s*[A-Z]'
        for composita_word in [ word for word in page.words if re.match(composita_pattern, word.text) ]:
            previousWord = self.get_previous_word2join(composita_word, page, transkription_field=transkription_field)
            if previousWord is not None:
                previousWord.join(composita_word)
                page.words.remove(composita_word)
        if bar is not None:
            bar.finish()

    def find_word_path(self, words_on_current_line, path=None):
        """Finds the words that form a path above or beneath words on the same uneven line.

        Starting at the first word that consists of a single char, the method looks to the
        left and then to the right for words that are on the same height or that follow the
        rotation direction of the current word, and whose text concatenated with the text
        collected so far (with or without a blank) is found on the pdf. The search is
        repeated recursively for the remaining words of the line.

        Args:
            words_on_current_line (list): the words of one line.
            path (list): the words found so far (a new list is used if omitted).

        [:return:] a list of word that belong to this path in the proper order.
        """
        if path is None:
            # never share a default list between calls
            path = []
        if len(words_on_current_line) < 2:
            return path
        THRESHOLD = 1.5
        words_on_path = []
        words_on_current_line = sorted(words_on_current_line, key=lambda word: word.transkription_positions[0].left)
        single_char_flags = [ bool(re.match(r'^\w$', word.text)) for word in words_on_current_line ]
        if True not in single_char_flags:
            # nothing to start a path from
            return path
        first_single_char_index = single_char_flags.index(True)
        current_word = words_on_current_line[first_single_char_index]
        transform_direction = self._rotation_direction(current_word)
        current_text = current_word.text
        # look left
        index = 1
        start_found = False
        while first_single_char_index-index >= 0 and not start_found:
            left_word = words_on_current_line[first_single_char_index-index]
            left_bottom = left_word.transkription_positions[-1].bottom
            current_bottom = current_word.transkription_positions[0].bottom
            # going left, the rotation direction is inverted
            if abs(left_bottom-current_bottom) < THRESHOLD\
                    or (transform_direction*-1 == Matrix.DOWN and left_bottom < current_bottom)\
                    or (transform_direction*-1 == Matrix.UP and left_bottom > current_bottom):
                if self.tree_contains_text(left_word.text + current_text):
                    current_text = left_word.text + current_text
                    words_on_path.insert(0, left_word)
                elif self.tree_contains_text(left_word.text + ' ' + current_text):
                    current_text = left_word.text + ' ' + current_text
                    words_on_path.insert(0, left_word)
                else:
                    start_found = True
                current_word = left_word
                transform_direction = self._rotation_direction(current_word)
            else:
                start_found = True
            index += 1
        # restart at the first single char word
        current_word = words_on_current_line[first_single_char_index]
        transform_direction = self._rotation_direction(current_word)
        words_on_path.append(current_word)
        # look right
        index = 1
        end_found = False
        while first_single_char_index+index < len(words_on_current_line) and not end_found:
            right_word = words_on_current_line[first_single_char_index+index]
            if abs(right_word.transkription_positions[-1].bottom-current_word.transkription_positions[0].bottom) < THRESHOLD\
                    or (transform_direction == Matrix.DOWN\
                        and right_word.transkription_positions[0].bottom < current_word.transkription_positions[-1].bottom)\
                    or (transform_direction == Matrix.UP\
                        and right_word.transkription_positions[0].bottom > current_word.transkription_positions[-1].bottom):
                if self.tree_contains_text(current_text + right_word.text):
                    current_text = current_text + right_word.text
                    words_on_path.append(right_word)
                elif self.tree_contains_text(current_text + ' ' + right_word.text):
                    current_text = current_text + ' ' + right_word.text
                    words_on_path.append(right_word)
                else:
                    end_found = True
                current_word = right_word
                transform_direction = self._rotation_direction(current_word)
            else:
                end_found = True
            index += 1
        path = path + words_on_path
        # continue with the last word that has been looked at
        index -= 1
        first_single_char_index += index
        remaining_words = words_on_current_line[first_single_char_index:]
        if first_single_char_index < len(words_on_current_line)\
                and any(bool(re.match(r'^\w$', word.text)) for word in remaining_words):
            return self.find_word_path(remaining_words, path=path)
        return path

    def join_single_char_words(self, page, transkription_field=None):
        """Joins words that consist of single chars if joined words are on pdf.

        ATTENTION: this removes the blank from self.sonderzeichen, the blank stays removed
        after this method has returned.
        """
        # guard: the blank has already been removed if this method has been called before
        if ' ' in self.sonderzeichen:
            self.sonderzeichen.remove(' ')
        single_char_words = [ word for word in page.words if re.match(r'^\w$', word.text) ]
        # first check for word path going above words on the same uneven line
        uneven_line_numbers = sorted(set(word.line_number for word in single_char_words
                if (word.line_number % 2 == 1 and word.line_number > 0)))
        for line_number in uneven_line_numbers:
            words_on_current_line = [ word for word in page.words if word.line_number == line_number ]
            if any(PositionalObject.POSITIONS_ARE_STACKED(a.transkription_positions[0], b.transkription_positions[0])
                    for a in words_on_current_line
                    for b in words_on_current_line
                    if a != b):
                word_path = self.find_word_path(words_on_current_line)
                previousWord = None
                for word in word_path:
                    if previousWord is not None\
                            and PositionalObject.POSITIONS_OVERLAP_HORIZONTALLY(
                                previousWord.transkription_positions[-1], word.transkription_positions[0]):
                        previousWord.join(word)
                        page.words.remove(word)
                    else:
                        previousWord = word
        ###TODO: this works only if we get the right spacing for each individual letter, look it up in svg path file
        bar = self._progress_bar('Joining single char words', len(single_char_words))
        for single_char_word in single_char_words:
            if bar is not None:
                bar.next()
            # skip words that have already been joined to another word
            if single_char_word in page.words:
                currentWord = single_char_word
                previousWord = self.get_previous_word2join(currentWord, page, transkription_field=transkription_field)
                if previousWord is not None:
                    previousWord.join(currentWord)
                    page.words.remove(currentWord)
                    currentWord = previousWord
                nextWord = self.get_next_word2join(currentWord, page, transkription_field=transkription_field)
                while nextWord is not None:
                    currentWord.join(nextWord)
                    page.words.remove(nextWord)
                    nextWord = self.get_next_word2join(currentWord, page, transkription_field=transkription_field)
        if bar is not None:
            bar.finish()

    def compare_svgWords2pdfWords(self, page, transkription_field=None, split_wrongly_concatenated_words=False):
        """ Compares each word to the word of the pdf and splits or joins them.

        Optionally splits wrongly concatenated words, then joins punctuation, composita
        and single char words (in this order). page.words is updated in place.
        """
        if split_wrongly_concatenated_words:
            page.words = self.split_wrongly_concatenated_words(page)
        self.add_punctuation2words(page, transkription_field=transkription_field)
        self.join_composita(page, transkription_field=transkription_field)
        self.join_single_char_words(page, transkription_field=transkription_field)

Event Timeline