Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F85108273
test_pdf.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Sep 26, 20:19
Size
6 KB
Mime Type
text/x-python
Expires
Sat, Sep 28, 20:19 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
21129912
Attached To
rNIETZSCHEPYTHON nietzsche-python
test_pdf.py
View Options
import
unittest
from
os
import
sep
,
path
from
os.path
import
isdir
,
dirname
,
basename
import
lxml.etree
as
ET
import
sys
import
re
import
sys
sys
.
path
.
append
(
'svgscripts'
)
from
datatypes.pdf
import
PDFText
from
datatypes.page
import
Page
from
datatypes.page_creator
import
PageCreator
from
datatypes.transkriptionField
import
TranskriptionField
from
datatypes.word
import
Word
from
extractWordPosition
import
Extractor
class TestPDFText(unittest.TestCase):
    """Tests for ``datatypes.pdf.PDFText`` run against fixtures in ``test_data``."""

    def setUp(self):
        """Locate the ``test_data`` directory and record the fixture paths.

        The directory is looked up next to this file first and, failing
        that, one directory level up.
        """
        # Anchor on the absolute location of this file: with a bare relative
        # ``__file__`` (script started from its own directory), ``dirname``
        # returns '' and plain concatenation with ``sep`` would point at
        # '/test_data' in the filesystem root.
        here = dirname(path.abspath(__file__))
        data_dir = path.join(here, 'test_data')
        if not isdir(data_dir):
            data_dir = path.join(dirname(here), 'test_data')
        self.pdf_file = path.join(data_dir, 'Mp_XIV_1_online_420.pdf')
        self.pdf_fileB = path.join(data_dir, 'W_I_8_page125.pdf')
        self.xml420 = path.join(data_dir, 'Mp_XIV_1_page420.xml')
        self.xml420_source = path.join(data_dir, 'Mp_XIV_1_online_420.svg')
        # Same file as ``pdf_file``; both attribute names are kept.
        self.pdf420 = path.join(data_dir, 'Mp_XIV_1_online_420.pdf')
        self.faulty_xml = path.join(data_dir, 'W_I_8_faulty_page125.xml')
        self.pdf_xml = path.join(data_dir, 'W_I_8_page125.xml')
        self.pdf_source = path.join(data_dir, "W_I_8_neu_125-01.svg")
        self.dir = data_dir

    @staticmethod
    def _words_matching(words, pattern):
        """Return the items of ``words`` whose ``text`` matches ``pattern`` at its start."""
        return [word for word in words if re.match(pattern, word.text)]

    def test_init(self):
        """The text tree holds 102 text nodes; ``current_page_number=1`` raises."""
        pdftext = PDFText(self.pdf_file)
        self.assertEqual(len(pdftext.text_tree.xpath('.//text')), 102)
        self.assertEqual(
            len(pdftext.text_tree.xpath('.//text[@id="{0}"]'.format(101))), 1)
        with self.assertRaises(Exception):
            PDFText(self.pdf_file, current_page_number=1)

    def test_tree_contains_text_at(self):
        """'nicht' is reported at position (146.1, 81)."""
        x = 146.1
        y = 81
        pdftext = PDFText(self.pdf_file)
        self.assertEqual(pdftext.tree_contains_text_at('nicht', x, y), True)

    def test_tree_contains_text(self):
        """Check text lookup with and without a position on the W_I_8 PDF."""
        pdftext = PDFText(self.pdf_fileB)
        self.assertEqual(pdftext.tree_contains_text('richtiger(richtiger'), False)
        self.assertEqual(pdftext.tree_contains_text('2ter'), True)
        self.assertEqual(pdftext.tree_contains_text_at('$', 320, 183), True)

    def test_split_str_according_to_pdf_tree(self):
        """Leading '.' and trailing '(' are dropped from the returned string."""
        pdftext = PDFText(self.pdf_fileB)
        self.assertEqual(
            pdftext.split_str_according_to_pdf_tree('.Insofern'), 'Insofern')
        self.assertEqual(
            pdftext.split_str_according_to_pdf_tree('sticht('), 'sticht')
        self.assertEqual(
            pdftext.split_str_according_to_pdf_tree('.sticht('), 'sticht')

    def test_split_wrongly_concatenated_words(self):
        """The single concatenated word of the faulty page becomes two words."""
        page = Page(self.faulty_xml)
        self.assertIn('wünschtheißt.', [item.text for item in page.words])
        self.assertEqual(len(page.words), 1)
        pdftext = PDFText(self.pdf_fileB, sonderzeichen=Extractor.SONDERZEICHEN_LIST)
        page.words = pdftext.split_wrongly_concatenated_words(page)
        self.assertNotIn('wünschtheißt.', [item.text for item in page.words])
        self.assertEqual(len(page.words), 2)

    @unittest.skip("have to fix PDFText.add_punctuation2words")
    def test_add_punctuation2words(self):
        """Standalone punctuation words drop from 5 to 1 (430 -> 426 words)."""
        page = Page(self.pdf_xml)
        tr = TranskriptionField(self.pdf_source)
        pat = r'^[-.=,:;?]$'
        punctuations = self._words_matching(page.words, pat)
        self.assertEqual(len(punctuations), 5)
        self.assertEqual(len(page.words), 430)
        pdftext = PDFText(self.pdf_fileB, sonderzeichen=Extractor.SONDERZEICHEN_LIST)
        pdftext.add_punctuation2words(page, transkription_field=tr)
        punctuations = self._words_matching(page.words, pat)
        self.assertEqual(len(punctuations), 1)
        self.assertEqual(len(page.words), 426)

    @unittest.skip("have to fix PDFText.join_composita")
    def test_add_composita(self):
        """After ``join_composita`` no word starts with '=' or '-' plus a capital."""
        page = Page(self.pdf_xml)
        tr = TranskriptionField(self.pdf_source)
        pat = r'^[=-]\s*[A-Z]'
        composita_part = self._words_matching(page.words, pat)
        self.assertEqual(len(composita_part), 1)
        pdftext = PDFText(self.pdf_fileB, sonderzeichen=Extractor.SONDERZEICHEN_LIST)
        pdftext.join_composita(page, transkription_field=tr)
        composita_part = self._words_matching(page.words, pat)
        self.assertEqual(len(composita_part), 0)
        self.assertEqual(len(page.words), 429)

    @unittest.skip("have to fix PDFText.join_single_char_words")
    def test_join_single_char_words(self):
        """The 26 single-character words on line 19 are all joined away."""
        pat = r'^\w$'
        # NOTE: an earlier variant of this test used the Mp_XIV_1 page 420
        # fixtures (self.xml420 / self.pdf420, line 13, 8 single-character
        # words). It was already disabled in the original and is not run here.
        page = PageCreator(self.pdf_xml, pdfFile=self.pdf_fileB)
        page.words[:] = [word for word in page.words if word.line_number == 19]
        # path.join also copes with an absolute ``page.source``, which plain
        # string concatenation would have turned into a broken path.
        tr = (TranskriptionField(path.join(self.dir, page.source))
              if page.source is not None else None)
        singles = self._words_matching(page.words, pat)
        self.assertEqual(len(singles), 26)
        pdftext = PDFText(self.pdf_fileB, sonderzeichen=Extractor.SONDERZEICHEN_LIST)
        pdftext.join_single_char_words(page, transkription_field=tr)
        singles = self._words_matching(page.words, pat)
        self.assertEqual(len(singles), 0)
        self.assertIn(':', [word.text for word in page.words])

    @unittest.skip("have to fix PDFText.find_word_path")
    def test_find_word_path(self):
        """``find_word_path`` returns as many items as the expected line-19 sequence."""
        page = PageCreator(self.pdf_xml, pdfFile=self.pdf_fileB)
        full_line19 = [word for word in page.words if word.line_number == 19]
        pdftext = PDFText(self.pdf_fileB, sonderzeichen=Extractor.SONDERZEICHEN_LIST)
        words_on_path = pdftext.find_word_path(full_line19)
        # Only the length of this sequence is compared, not its contents.
        expected_words = [
            ':', 'aber', 'schon', 'in', 'der', 'Gebur', 't',
            'd', 'e', 'r', 'T', 'r', 'a', 'g', 'ö', 'd', 'i', 'e', 'u', '.',
            'i', 'h', 'r', 'e', 'r', 'L', 'e', 'h', 'r', 'e', 'v', 'o', 'm',
            'Dionys.', 'ist', 'der', 'Schop.', 'Pessimismus', 'überwunden.',
        ]
        self.assertEqual(len(words_on_path), len(expected_words))
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Event Timeline
Log In to Comment