Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F86467903
test_extractWordPosition.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Oct 6, 16:21
Size
14 KB
Mime Type
text/x-python
Expires
Tue, Oct 8, 16:21 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
21426539
Attached To
rNIETZSCHEPYTHON nietzsche-python
test_extractWordPosition.py
View Options
import
unittest
import
os
from
os
import
sep
,
path
from
os.path
import
isfile
,
isdir
,
dirname
import
re
import
shutil
import
tempfile
import
lxml.etree
as
ET
from
lxml.etree
import
XMLSyntaxError
import
sys
sys
.
path
.
append
(
'svgscripts'
)
import
extractWordPosition
from
myxmlwriter
import
write_pretty
from
datatypes.transkriptionField
import
TranskriptionField
from
datatypes.matrix
import
Matrix
from
datatypes.page_creator
import
PageCreator
,
FILE_TYPE_SVG_WORD_POSITION
from
datatypes.page
import
Page
from
datatypes.positional_word_part
import
PositionalWordPart
from
datatypes.pdf
import
PDFText
from
datatypes.word
import
Word
from
datatypes.lineNumber
import
LineNumber
from
datatypes.word_insertion_mark
import
WordInsertionMark
def test_write(xml_element_tree=None, file_name=None):
    """Hand xml_element_tree to write_pretty, tagged as output of the script 'test'.

    :param xml_element_tree: the tree that is passed on to write_pretty.
    :param file_name: accepted for the callers' convenience.
        NOTE(review): this value is not forwarded -- write_pretty always
        receives file_name=None. Confirm that this is intended.
    """
    write_arguments = {
        'xml_element_tree': xml_element_tree,
        'file_name': None,
        'script_name': 'test',
        'file_type': FILE_TYPE_SVG_WORD_POSITION,
    }
    write_pretty(**write_arguments)
class TestExtractor(unittest.TestCase):
    """Tests for extractWordPosition.Extractor and the page objects it fills."""

    def setUp(self):
        """Set Extractor.UNITTESTING and prepare the fixture paths used by the tests."""
        extractWordPosition.Extractor.UNITTESTING = True
        # path.join also yields a usable relative path when dirname(__file__)
        # is empty; `dirname(__file__) + sep + ...` would point to the root dir.
        data_dir = path.join(dirname(__file__), 'test_data')

        def data_file(file_name):
            return path.join(data_dir, file_name)

        self.test_file_find_word = data_file('test_find_word.xml')
        self.test_dir = tempfile.mkdtemp()
        self.title = 'ABC 111'
        self.matrix_string = 'matrix(1 0 0 1 183.6558 197.9131)'
        self.test_file = data_file('Mp_XIV_1_mytest_421.svg')
        self.test_empty_file = data_file('my_empty_test.svg')
        self.test_source = data_file('Mp_XIV_1_mytest_421.xml')
        self.xml420 = data_file('Mp_XIV_1_page420.xml')
        self.pdf420 = data_file('Mp_XIV_1_online_420.pdf')
        self.pdf_file = data_file('W_I_8_page125.pdf')
        self.faulty_xml = data_file('W_I_8_faulty_page125.xml')
        self.pdf_xml = data_file('W_I_8_page125.xml')
        self.pdf_xml_source = data_file('W_I_8_neu_125-01.svg')
        self.testA = data_file('testA.xml')
        self.multipage = data_file('multipage_small_above.svg')

    @staticmethod
    def _remove_file_if_present(file_name):
        """Delete file_name if it is an existing regular file."""
        if isfile(file_name):
            os.remove(file_name)

    @staticmethod
    def _create_word_part_objs():
        """Return the three word part dictionaries shared by the break point tests."""
        matrix = Matrix('matrix(1 0 0 1 543.8164 173.9126)')
        matrixB = Matrix('matrix(1 0 0 1 573.6758 173.9126)')
        matrixC = Matrix('matrix(1 0 0 1 575.9873 173.9126)')
        return [
            {'text': 'es', 'class': 'st5 st6', 'x': matrix.add2X(23.968), 'y': matrix.getY()},
            {'text': 'A', 'class': 'st9 st10', 'x': matrixB.getX(), 'y': matrixB.getY()},
            {'text': 'sich', 'class': "st5 st6", 'x': matrixC.getX(), 'y': matrixC.getY()},
        ]

    def _assert_title_and_no_style(self, tree):
        """Assert that the root of tree carries self.title and has no style child."""
        self.assertEqual(tree.getroot().get('title'), self.title)
        self.assertEqual(tree.getroot().findall('./style'), [])

    def _assert_title_and_style_in_file(self, file_name):
        """Parse file_name and assert the title and the style values the tests expect."""
        xml_root = ET.parse(file_name).getroot()
        self.assertEqual(xml_root.get('title'), self.title)
        self.assertEqual(xml_root.find("style").get('Sonderzeichen'), "st21 st23")
        self.assertEqual(xml_root.find("style").find("class[@name='st5']").get('stroke'), '#CED5CE')
        self.assertEqual(xml_root.find("style").find("class[@name='st11']").get('font-family'), 'Frutiger-Europeen')

    def test_extract_information(self):
        """Check extract_information on a multipage svg and on a page with a svg_file."""
        extractor = extractWordPosition.Extractor()
        page = extractor.extract_information(self.multipage, multipage_index=0)
        self.assertEqual(len(page.words), 59)
        self.assertEqual(page.multipage_index, 0)
        page = extractor.extract_information(self.multipage, multipage_index=1)
        self.assertEqual(page.multipage_index, 1)
        self.assertGreater(len(page.words), 59)
        # use a fresh extractor for the second source file
        extractor = extractWordPosition.Extractor()
        source_page = Page('xml/Mp_XV_page78v.xml')
        transkription_field = TranskriptionField(source_page.source)
        svg_tree = ET.parse(source_page.source)
        text_items = extractor.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)
        transforms = [item.get('transform') for item in text_items]
        self.assertIn('matrix(1 0 0 1 115.6299 719.3535)', transforms)
        page = extractor.extract_information(source_page.source, svg_file=source_page.svg_file)
        self.assertIsNotNone(page.svg_image.text_field)

    ##:map <F5> :w<Enter>:!python3 -m unittest tests_svgscripts.test_extractWordPosition.TestExtractor.test_improved_extract_word_position<Enter>
    @unittest.skip('test with local file')
    def test_improved_extract_word_position(self):
        """Run improved_extract_word_position on a local page and print the word counts."""
        extractor = extractWordPosition.Extractor()
        source_page = Page('xml/Mp_XV_page85v.xml')
        print(len(source_page.words))
        source_page.words = []
        transkription_field = TranskriptionField(source_page.source)
        svg_tree = ET.parse(source_page.source)
        extractor.improved_extract_word_position(svg_tree, source_page, transkription_field=transkription_field)
        print(len(source_page.words))

    ##:map <F5> :w<Enter>:!python3 -m unittest tests_svgscripts.test_extractWordPosition.TestExtractor.test_process_pwps_break_points<Enter>
    def test_process_pwps_break_points(self):
        """Feed the parts of two words joined by an insertion mark to _process_pwps_break_points.

        NOTE(review): this test only prints the resulting word texts; it has no
        assertion yet -- add one once the expected result is confirmed.
        """
        extractor = extractWordPosition.Extractor()
        source_page = Page('xml/Mp_XV_page86r.xml')
        first_word = next(word for word in source_page.words
                          if word.text == 'Sorgen' and word.line_number == 2)
        # pwps is the word's own list: the following append/+= extend it in place
        pwps = first_word.transkription_positions[0].positional_word_parts
        wim = next(mark for mark in source_page.word_insertion_marks if mark.id == '5')
        pwps.append(PositionalWordPart(text=wim.mark_type, symbol_id=wim.symbol_id,
                                       x=wim.left, y=wim.top, height=wim.height, width=wim.width,
                                       style_class=source_page.sonderzeichen_list[0]))
        second_word = next(word for word in source_page.words
                           if word.text == 'los' and word.line_number == 2)
        pwps += second_word.transkription_positions[0].positional_word_parts
        source_page.words = []
        break_points = extractor._get_pwps_break_points(source_page, pwps)
        extractor._process_pwps_break_points(break_points, source_page, 0, pwps)
        print([word.text for word in source_page.words])

    def test_update_title(self):
        """update_title_and_manuscript sets title and manuscript_file and creates the file."""
        extractor = extractWordPosition.Extractor(xml_dir=self.test_dir)
        extractor.update_title_and_manuscript('test')
        manuscript_file = '{}/test.xml'.format(self.test_dir)
        self.assertEqual(extractor.title, 'test')
        self.assertEqual(extractor.manuscript_file, manuscript_file)
        self.assertTrue(isfile(manuscript_file))

    def test_get_page_number(self):
        """get_page_number uses the given page_number or the one from the file name."""
        extractor = extractWordPosition.Extractor()
        self.assertEqual(extractor.get_page_number(self.test_file, page_number='1'), '001')
        self.assertEqual(extractor.get_page_number(self.test_file), '421')

    def test_get_file_name(self):
        """get_file_name depends on whether the extractor has a title or a manuscript file."""
        extractor = extractWordPosition.Extractor()
        self.assertEqual(extractor.get_file_name(self.test_file), 'xml/Mp_XIV_1_mytest_421.xml')
        file_name_with_title = 'xml/{}_page421.xml'.format(self.title.replace(' ', '_'))
        extractor = extractWordPosition.Extractor(title=self.title)
        self.assertEqual(extractor.get_file_name(self.test_file), file_name_with_title)
        extractorA = extractWordPosition.Extractor(title=self.title)
        extractorB = extractWordPosition.Extractor(manuscript_file=extractorA.manuscript_file)
        self.assertEqual(extractorB.get_file_name(self.test_file), file_name_with_title)

    def test_get_style(self):
        """get_style returns the expected Sonderzeichen list and style dictionary entries."""
        extractor = extractWordPosition.Extractor()
        svg_tree = ET.parse(self.test_file)
        sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
        self.assertEqual(sonderzeichen_list, ['st21', 'st23'])
        self.assertEqual(style_dict.get('st11').get('font-family'), 'Frutiger-Europeen')
        self.assertEqual(style_dict.get('st5').get('stroke'), '#CED5CE')

    def test_get_word_from_part_obj(self):
        """get_word_from_part_obj joins the 'text' values of the word part dictionaries."""
        extractor = extractWordPosition.Extractor()
        mylist = [{'text': 'a', 'class': 'asdf'}, {'text': 'b', 'endX': 0}, {'text': 'c'}]
        self.assertEqual(extractor.get_word_from_part_obj(mylist), 'abc')

    def test_get_break_points(self):
        """_get_break_points finds at least one break point in the sample word parts."""
        extractor = extractWordPosition.Extractor()
        page = Page(self.pdf_xml)
        page.source = self.pdf_xml_source
        mylist = self._create_word_part_objs()
        break_points = extractor._get_break_points(page, mylist)
        self.assertGreater(len(break_points), 0)

    def test_get_pwps_break_points(self):
        """_get_pwps_break_points finds at least one break point in the created word parts."""
        extractor = extractWordPosition.Extractor()
        page = Page(self.pdf_xml)
        page.svg_file = "./svg/W_I_8_page125_web.svg"
        page.source = self.pdf_xml_source
        svg_path_tree = ET.parse(page.svg_file)
        # the default namespace has the key None in nsmap: map it to the prefix 'ns'
        namespaces = {k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items()}
        pwps = []
        for word_part_obj in self._create_word_part_objs():
            pwps += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_path_tree, namespaces, page=page)
        self.assertGreater(len(page.sonderzeichen_list), 0)
        break_points = extractor._get_pwps_break_points(page, pwps)
        self.assertGreater(len(break_points), 0)

    def test_get_text_items(self):
        """get_text_items returns the expected items with and without a transkription field."""
        svg_tree = ET.parse(self.test_file)
        extractor = extractWordPosition.Extractor()
        mytest_items = list(extractor.get_text_items(svg_tree.getroot()))
        self.assertEqual(len(mytest_items), 300)
        self.assertEqual(mytest_items[0].get('transform'), 'matrix(1 0 0 1 386.8218 57.1914)')
        tf = TranskriptionField(self.test_file)
        mytest_itemsTF = list(extractor.get_text_items(svg_tree.getroot(), transkription_field=tf))
        self.assertEqual(mytest_itemsTF[0].get('transform'), 'matrix(1 0 0 1 204.8618 91.7134)')

    def test_init_tree_and_target_file(self):
        """PageCreator keeps the title and adds no style, before and after test_write."""
        target_file = self.testA
        # registered first, so the file is removed even if an assertion fails
        self.addCleanup(self._remove_file_if_present, target_file)
        page = PageCreator(target_file, title=self.title)
        tree = page.page_tree
        self._assert_title_and_no_style(tree)
        test_write(xml_element_tree=tree, file_name=target_file)
        page = PageCreator(target_file)
        self._assert_title_and_no_style(page.page_tree)

    def test_add_style(self):
        """add_style output is found in the target file, also when it is applied twice."""
        extractor = extractWordPosition.Extractor()
        svg_tree = ET.parse(self.test_file)
        sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
        target_file = self.testA
        # registered first, so the file is removed even if an assertion fails
        self.addCleanup(self._remove_file_if_present, target_file)
        page = PageCreator(target_file, title=self.title)
        page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict)
        test_write(xml_element_tree=page.page_tree, file_name=target_file)
        self._assert_title_and_style_in_file(target_file)
        # repeat with a page created from the target file
        page = PageCreator(target_file)
        page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict)
        test_write(xml_element_tree=page.page_tree, file_name=target_file)
        self._assert_title_and_style_in_file(target_file)

    def test_add_word(self):
        """add_word returns the expected values and the words show up in the page tree."""
        extractor = extractWordPosition.Extractor()
        svg_tree = ET.parse(self.test_file)
        mylist = [{'text': 'a'}, {'text': 'b'}, {'text': 'c'}]
        matrix = Matrix(self.matrix_string)
        for word_part_obj in mylist:
            word_part_obj['class'] = 'st22'
            word_part_obj['x'] = matrix.add2X(0)
            word_part_obj['y'] = matrix.getY()
        target_file = self.test_dir + sep + 'asdfasdf.xml'
        page = PageCreator(target_file)
        sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
        page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
        self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 1)
        mylist[1]['text'] = 'A'
        mylist[1]['class'] = 'st21'
        mylist[1]['x'] = matrix.add2X(1)
        self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 2)
        page.update_and_attach_words2tree()
        page_root = page.page_tree.getroot()
        self.assertEqual(page_root.xpath('//word[@id="1"]')[0].get('text'), 'a')
        self.assertEqual(page_root.xpath('//word[@id="2"]')[0].get('text'), 'c')
        self.assertEqual(page_root.xpath('//word[@id="2"]/transkription-position')[0].get('left'), '183.506')
        self.assertEqual(page_root.xpath('//word[@id="2"]/transkription-position')[0].get('height'), '8.25')

    def test_extractor(self):
        """An Extractor created without arguments has the expected attribute values."""
        extractor = extractWordPosition.Extractor()
        self.assertIsNone(extractor.title)
        self.assertIsNone(extractor.manuscript_file)
        self.assertEqual(extractor.xml_dir, 'xml/')
        self.assertIsNone(extractor.manuscript_tree)

    def test_write_title_to_manuscript_file(self):
        """An Extractor created from a manuscript file gets the title used to create that file."""
        extractor = extractWordPosition.Extractor(xml_dir=self.test_dir, title=self.title)
        self.assertTrue(isfile(extractor.manuscript_file))
        extractor = extractWordPosition.Extractor(manuscript_file=extractor.manuscript_file)
        self.assertEqual(extractor.title, self.title)

    def tearDown(self):
        """Remove the temporary directory and the title file in 'xml', if they exist."""
        if isdir(self.test_dir):
            shutil.rmtree(self.test_dir)
        self._remove_file_if_present('{}/{}.xml'.format('xml', self.title.replace(' ', '_')))
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Event Timeline
Log In to Comment