Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F113708298
pdf.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, May 20, 13:42
Size
21 KB
Mime Type
text/x-python
Expires
Thu, May 22, 13:42 (1 d, 10 h)
Engine
blob
Format
Raw Data
Handle
26212019
Attached To
rNIETZSCHEPYTHON nietzsche-python
pdf.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This module provides the class PDFText, which represents a pdf and extracts text from it.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata: authorship, licensing and version information.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import
lxml.etree
as
ET
from
pdfminer.pdfparser
import
PDFParser
from
pdfminer.pdfdocument
import
PDFDocument
from
pdfminer.pdfpage
import
PDFPage
from
pdfminer.pdfpage
import
PDFTextExtractionNotAllowed
from
pdfminer.pdfinterp
import
PDFResourceManager
from
pdfminer.pdfinterp
import
PDFPageInterpreter
from
pdfminer.pdfdevice
import
PDFDevice
from
pdfminer.layout
import
LAParams
from
pdfminer.converter
import
PDFPageAggregator
import
pdfminer
from
progress.bar
import
Bar
import
re
import
warnings
from
os
import
path
from
os.path
import
isfile
,
sep
from
.positional_object
import
PositionalObject
from
.matrix
import
Matrix
class PDFText:
    """This class represents one page of a pdf and extracts text from it.

    On construction the requested page is processed with pdfminer and every
    text object found on it is stored as a ``text`` element of an XML tree
    (``self.text_tree``).  Each element carries the rounded bounding box of
    the text object in its attributes ``xmin``, ``ymin``, ``xmax`` and
    ``ymax``.  The other methods query this tree in order to decide whether
    the words of a page have to be split or joined.

    Args:
        pdfFile (str): the pdf file name.
        current_page_number (int): the index of the page that is read
            (the default 0 selects the first page).
        sonderzeichen (list): strings that may stand between two words that
            are joined; the empty string and a single blank are always
            included, as are all pairwise concatenations of the given strings.

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
        Exception: if the file does not contain the requested page.
    """
    UNITTESTING = False
    # Largest difference of two `bottom` values that is still treated as "same height".
    _THRESHOLD = 1.5
    # Largest difference of two `left` values for words that are matched without a line number.
    _LEFTDIFF = 100
    # No progress bar is shown for fewer items than this.
    _MIN_ITEMS_FOR_BAR = 10
    _SINGLE_CHAR_PATTERN = r'^\w$'

    def __init__(self, pdfFile, current_page_number=0, sonderzeichen=None):
        # A fresh list per call: a mutable default would be shared by all instances.
        sonderzeichen = [] if sonderzeichen is None else list(sonderzeichen)
        self.pdfFile = pdfFile
        self.sonderzeichen = ['', ' '] + sonderzeichen\
                + [first + second for first in sonderzeichen for second in sonderzeichen]
        # The context manager closes the file on every exit path, including the raises below.
        with open(self.pdfFile, 'rb') as fp:
            document = PDFDocument(PDFParser(fp))
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            self.current_page_number = current_page_number
            self.text_tree = ET.ElementTree(ET.Element('pdf'))
            pages = list(PDFPage.create_pages(document))
            if len(pages) <= self.current_page_number:
                raise Exception('File {} does not contain page number {}'.format(self.pdfFile, self.current_page_number))
            self.current_page = pages[self.current_page_number]
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(self.current_page)
            layout = device.get_result()
            root = self.text_tree.getroot()
            for obj in layout._objs:
                if isinstance(obj, pdfminer.layout.LTText):
                    # The root contains nothing but the text nodes added here,
                    # hence its length is the number of nodes created so far.
                    text_id = len(root)
                    text_node = ET.SubElement(root, 'text', attrib={
                        'id': str(text_id),
                        'xmin': str(round(obj.bbox[0], 3)),
                        'ymin': str(round(obj.bbox[1], 3)),
                        'xmax': str(round(obj.bbox[2], 3)),
                        'ymax': str(round(obj.bbox[3], 3))})
                    text_node.text = obj.get_text().replace('\n', '')

    @staticmethod
    def _create_bar(message, item_count):
        """Returns a progress bar, or None if unittesting or if there are only a few items.
        """
        if PDFText.UNITTESTING or item_count < PDFText._MIN_ITEMS_FOR_BAR:
            return None
        return Bar(message, max=item_count)

    @staticmethod
    def _has_transform(word):
        """Returns whether any transkription position of word has a transform.
        """
        return any(position.transform is not None for position in word.transkription_positions)

    @staticmethod
    def _get_rotation_direction(word):
        """Returns the rotation direction of the first position of word (Matrix.STRAIGHT without transform).
        """
        transform = word.transkription_positions[0].transform
        if transform is None:
            return Matrix.STRAIGHT
        return transform.get_rotation_direction()

    def _find_first_joinable_word(self, candidates, create_text_list, transkription_field=None):
        """Returns the first candidate for which one of its joined texts is found on the pdf, else None.

        candidates: the words to test, in the order in which they are tested.
        create_text_list: a function that returns for a candidate the list of joined texts to look up.
        """
        for candidate in candidates:
            position = candidate.transkription_positions[0]
            left = position.left
            bottom = position.bottom
            if transkription_field is not None:
                # Add the field's xmin/ymin to the word position before the lookup.
                left = left + transkription_field.xmin
                bottom = bottom + transkription_field.ymin
            if any(self.tree_contains_text_at(text, left, bottom) for text in create_text_list(candidate)):
                return candidate
        return None

    def tree_contains_text_at(self, text, left, bottom) -> bool:
        """Returns whether tree contains the text at the specified position.

        The tested point is (left + 3, MediaBox[3] - bottom + 3), i.e. bottom
        is subtracted from the fourth value of the page's MediaBox.
        """
        OFFSET = 3
        x = left + OFFSET
        y = self.current_page.attrs['MediaBox'][3] - bottom + OFFSET
        # XPath variables instead of string formatting: a text that contains
        # a quote would otherwise produce an invalid expression.
        nodes = self.text_tree.xpath(
            ".//text[contains(., $text) and @xmin<=$x and @xmax>=$x and @ymin<=$y and @ymax>=$y]",
            text=text, x=x, y=y)
        return len(nodes) > 0

    def tree_contains_text(self, text) -> bool:
        """Returns whether tree contains the text anywhere on the page.
        """
        # XPath variable for the same reason as in tree_contains_text_at.
        return len(self.text_tree.xpath(".//text[contains(., $text)]", text=text)) > 0

    def split_str_according_to_pdf_tree(self, text) -> str:
        """Returns the string that has been found in the tree.

        Tested in this order: text, text without its first char, text without
        its last char, text without both.  Returns '' if none is found.
        """
        for candidate in (text, text[1:], text[:-1], text[1:-1]):
            if self.tree_contains_text(candidate):
                return candidate
        return ''

    def split_wrongly_concatenated_words(self, page):
        """Test for falsely concatenated words and split them

        A word whose text is not found on the pdf is shortened from its end
        until a part of it is found; the word is then split at this part.
        If no part is found, a warning is issued and the word is not added
        to the result.

        [:returns:] an updated Array of all (datatypes.word) Words
        """
        new_words = []
        for word in page.words:
            if self.tree_contains_text(word.text):
                new_words.append(word)
                continue
            word_found = False
            index = len(word.text)
            while not word_found and index > 0:
                result = self.split_str_according_to_pdf_tree(word.text[:index])
                if len(result) > 0:
                    word_found = True
                    split_words = word.split(result, start_id=len(page.words))
                    previousWord, currentWord, nextWord = split_words
                    if previousWord is not None:
                        new_words.append(previousWord)
                    new_words.append(currentWord)
                    if nextWord is not None:
                        new_words.append(nextWord)
                else:
                    index -= 1
            if not word_found:
                warnings.warn('ATTENTION: Word not found: {} on line {}: {}!'.format(word.id, word.line_number, word.text))
        return new_words

    def get_previous_word2join(self, word2join, page, transkription_field=None):
        """Finds previous word to word2join and returns it after testing if joined word is on pdf.

        Candidates are the words that end left of word2join at about the same
        height: words on the same line, or - if word2join has line number -1
        or a transformed position - all words within a limited horizontal
        distance.  Returns None if no candidate can be joined.
        """
        if word2join.line_number == -1 or self._has_transform(word2join):
            candidates = [word for word in page.words
                          if abs(word.transkription_positions[-1].bottom
                                 - word2join.transkription_positions[0].bottom) < self._THRESHOLD
                          and abs(word.transkription_positions[-1].left
                                  - word2join.transkription_positions[0].left) < self._LEFTDIFF
                          and word.transkription_positions[-1].left < word2join.transkription_positions[0].left]
        else:
            candidates = [word for word in page.words
                          if word.line_number == word2join.line_number
                          and word.transkription_positions[-1].left < word2join.transkription_positions[0].left
                          and abs(word.transkription_positions[-1].bottom
                                  - word2join.transkription_positions[0].bottom) < self._THRESHOLD]
        # reverse sorted -> first item should be word to join.
        candidates.sort(key=lambda word: word.transkription_positions[0].left, reverse=True)
        return self._find_first_joinable_word(
            candidates,
            lambda candidate: [candidate.text + sonderzeichen + word2join.text for sonderzeichen in self.sonderzeichen],
            transkription_field=transkription_field)

    def get_next_word2join(self, word2join, page, transkription_field=None):
        """Finds next word to join word2join and returns it after testing if joined word is on pdf.

        Candidates are the words that start right of word2join at about the
        same height: words on the same line, or - if word2join has line
        number -1 or a transformed position - all words within a limited
        horizontal distance.  Returns None if no candidate can be joined.
        """
        if word2join.line_number == -1 or self._has_transform(word2join):
            candidates = [word for word in page.words
                          if abs(word.transkription_positions[0].bottom
                                 - word2join.transkription_positions[0].bottom) < self._THRESHOLD
                          and abs(word.transkription_positions[0].left
                                  - word2join.transkription_positions[-1].left) < self._LEFTDIFF
                          and word.transkription_positions[0].left > word2join.transkription_positions[-1].left]
        else:
            candidates = [word for word in page.words
                          if word.line_number == word2join.line_number
                          and word.transkription_positions[0].left > word2join.transkription_positions[0].left
                          and abs(word.transkription_positions[0].bottom
                                  - word2join.transkription_positions[0].bottom) < self._THRESHOLD]
        # sorted -> first item should be word to join.
        candidates.sort(key=lambda word: word.transkription_positions[0].left)
        return self._find_first_joinable_word(
            candidates,
            lambda candidate: [word2join.text + sonderzeichen + candidate.text for sonderzeichen in self.sonderzeichen],
            transkription_field=transkription_field)

    def add_punctuation2words(self, page, transkription_field=None):
        """Join words that consist of punctuation only to words.

        Joined punctuation words are removed from page.words.
        """
        punctuation_pattern = r'^[.,:;?]$'
        punctuation_words = [word for word in page.words if re.match(punctuation_pattern, word.text)]
        bar = self._create_bar('Joining punctuations with words', len(punctuation_words))
        for punctuation_word in punctuation_words:
            if bar is not None:
                bar.next()
            previousWord = self.get_previous_word2join(punctuation_word, page, transkription_field=transkription_field)
            if previousWord is not None:
                previousWord.join(punctuation_word)
                page.words.remove(punctuation_word)
        if bar is not None:
            bar.finish()

    def join_composita(self, page, transkription_field=None):
        """Joins composita.

        Words that consist of '-' or '=' only are joined with their previous
        and next word; afterwards words that start with '-' or '=' followed
        by a capital letter are joined with their previous word.  Joined
        words are removed from page.words.
        """
        connection_words = [word for word in page.words if re.match(r'^[-=]$', word.text)]
        bar = self._create_bar('Joining composita', len(connection_words))
        for connection_word in connection_words:
            if bar is not None:
                bar.next()
            if connection_word not in page.words:
                # Already joined as the next word of an earlier connection word;
                # joining or removing it a second time would fail.
                continue
            previousWord = self.get_previous_word2join(connection_word, page, transkription_field=transkription_field)
            nextWord = self.get_next_word2join(connection_word, page, transkription_field=transkription_field)
            if previousWord is not None:
                previousWord.join(connection_word)
                page.words.remove(connection_word)
                if nextWord is not None:
                    previousWord.join(nextWord)
                    page.words.remove(nextWord)
            elif nextWord is not None:
                connection_word.join(nextWord)
                page.words.remove(nextWord)
        composita_pattern = r'^[=-]\s*[A-Z]'
        composita_words = [word for word in page.words if re.match(composita_pattern, word.text)]
        for composita_word in composita_words:
            previousWord = self.get_previous_word2join(composita_word, page, transkription_field=transkription_field)
            if previousWord is not None:
                previousWord.join(composita_word)
                page.words.remove(composita_word)
        if bar is not None:
            bar.finish()

    def find_word_path(self, words_on_current_line, path=None):
        """Finds the words that form a path above or beneath words on the same uneven line.

        Starting at the first word that consists of a single char, the words
        to its left and to its right are added to the path as long as they are
        at a matching height and the concatenated text (with or without a
        blank) is found on the pdf.  The search is then repeated for the
        remaining words of the line.

        [:return:] a list of word that belong to this path in the proper order.
        """
        # A fresh list per call: a mutable default would be handed out to every caller.
        path = [] if path is None else path
        if len(words_on_current_line) < 2:
            return path
        words_on_current_line = sorted(words_on_current_line, key=lambda word: word.transkription_positions[0].left)
        is_single_char = [bool(re.match(self._SINGLE_CHAR_PATTERN, word.text)) for word in words_on_current_line]
        if True not in is_single_char:
            # Without a single char word there is no starting point for a path.
            return path
        first_single_char_index = is_single_char.index(True)
        words_on_path = []
        current_word = words_on_current_line[first_single_char_index]
        transform_direction = self._get_rotation_direction(current_word)
        current_text = current_word.text
        # look left
        index = 1
        start_found = False
        while first_single_char_index - index >= 0 and not start_found:
            left_word = words_on_current_line[first_single_char_index - index]
            left_bottom = left_word.transkription_positions[-1].bottom
            current_bottom = current_word.transkription_positions[0].bottom
            if abs(left_bottom - current_bottom) < self._THRESHOLD\
                    or (transform_direction * -1 == Matrix.DOWN and left_bottom < current_bottom)\
                    or (transform_direction * -1 == Matrix.UP and left_bottom > current_bottom):
                if self.tree_contains_text(left_word.text + current_text):
                    current_text = left_word.text + current_text
                    words_on_path.insert(0, left_word)
                elif self.tree_contains_text(left_word.text + ' ' + current_text):
                    current_text = left_word.text + ' ' + current_text
                    words_on_path.insert(0, left_word)
                else:
                    start_found = True
                current_word = left_word
                transform_direction = self._get_rotation_direction(current_word)
            else:
                start_found = True
            index += 1
        # restart at the single char word
        current_word = words_on_current_line[first_single_char_index]
        transform_direction = self._get_rotation_direction(current_word)
        words_on_path.append(current_word)
        # look right
        index = 1
        end_found = False
        while first_single_char_index + index < len(words_on_current_line) and not end_found:
            right_word = words_on_current_line[first_single_char_index + index]
            if abs(right_word.transkription_positions[-1].bottom
                   - current_word.transkription_positions[0].bottom) < self._THRESHOLD\
                    or (transform_direction == Matrix.DOWN
                        and right_word.transkription_positions[0].bottom
                        < current_word.transkription_positions[-1].bottom)\
                    or (transform_direction == Matrix.UP
                        and right_word.transkription_positions[0].bottom
                        > current_word.transkription_positions[-1].bottom):
                if self.tree_contains_text(current_text + right_word.text):
                    current_text = current_text + right_word.text
                    words_on_path.append(right_word)
                elif self.tree_contains_text(current_text + ' ' + right_word.text):
                    current_text = current_text + ' ' + right_word.text
                    words_on_path.append(right_word)
                else:
                    end_found = True
                current_word = right_word
                transform_direction = self._get_rotation_direction(current_word)
            else:
                end_found = True
            index += 1
        path = path + words_on_path
        # index has been incremented once more after the last word that was looked at.
        index -= 1
        first_single_char_index += index
        remaining_words = words_on_current_line[first_single_char_index:]
        if first_single_char_index < len(words_on_current_line)\
                and any(re.match(self._SINGLE_CHAR_PATTERN, word.text) for word in remaining_words):
            return self.find_word_path(remaining_words, path=path)
        return path

    def join_single_char_words(self, page, transkription_field=None):
        """Joins words that consist of single chars if joined words are on pdf.

        The blank is removed from self.sonderzeichen and is not restored
        afterwards.  Joined words are removed from page.words.
        """
        # Only remove the blank if it is still there: this method may be called
        # more than once on the same instance.
        if ' ' in self.sonderzeichen:
            self.sonderzeichen.remove(' ')
        single_char_words = [word for word in page.words if re.match(self._SINGLE_CHAR_PATTERN, word.text)]
        # first check for word path going above words on the same uneven line
        uneven_line_numbers = sorted(set(word.line_number for word in single_char_words
                                         if word.line_number % 2 == 1 and word.line_number > 0))
        for line_number in uneven_line_numbers:
            words_on_current_line = [word for word in page.words if word.line_number == line_number]
            if any(PositionalObject.POSITIONS_ARE_STACKED(a.transkription_positions[0], b.transkription_positions[0])
                   for a in words_on_current_line
                   for b in words_on_current_line
                   if a != b):
                word_path = self.find_word_path(words_on_current_line)
                previousWord = None
                for word in word_path:
                    if previousWord is not None\
                            and PositionalObject.POSITIONS_OVERLAP_HORIZONTALLY(
                                previousWord.transkription_positions[-1], word.transkription_positions[0]):
                        previousWord.join(word)
                        page.words.remove(word)
                    else:
                        previousWord = word
        ###TODO: this works only if we get the right spacing for each individual letter, look it up in svg path file
        bar = self._create_bar('Joining single char words', len(single_char_words))
        for single_char_word in single_char_words:
            if bar is not None:
                bar.next()
            if single_char_word not in page.words:
                # has already been joined with another word
                continue
            currentWord = single_char_word
            previousWord = self.get_previous_word2join(currentWord, page, transkription_field=transkription_field)
            if previousWord is not None:
                previousWord.join(currentWord)
                page.words.remove(currentWord)
                currentWord = previousWord
            nextWord = self.get_next_word2join(currentWord, page, transkription_field=transkription_field)
            while nextWord is not None:
                currentWord.join(nextWord)
                page.words.remove(nextWord)
                nextWord = self.get_next_word2join(currentWord, page, transkription_field=transkription_field)
        if bar is not None:
            bar.finish()

    def compare_svgWords2pdfWords(self, page, transkription_field=None, split_wrongly_concatenated_words=False):
        """ Compares each word to the word of the pdf and splits or joins them.

        Words are split only if split_wrongly_concatenated_words is True.
        Afterwards punctuation, composita and single char words are joined,
        in this order.  page.words is updated in place.
        """
        if split_wrongly_concatenated_words:
            page.words = self.split_wrongly_concatenated_words(page)
        self.add_punctuation2words(page, transkription_field=transkription_field)
        self.join_composita(page, transkription_field=transkription_field)
        self.join_single_char_words(page, transkription_field=transkription_field)
Event Timeline
Log In to Comment