Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F61991758
test_word.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, May 10, 06:25
Size
13 KB
Mime Type
text/x-python
Expires
Sun, May 12, 06:25 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
17587800
Attached To
rNIETZSCHEPYTHON nietzsche-python
test_word.py
View Options
import
unittest
from
os
import
sep
,
path
import
lxml.etree
as
ET
import
sys
sys
.
path
.
append
(
'svgscripts'
)
from
datatypes.box
import
Box
from
datatypes.matrix
import
Matrix
import
datatypes.page
from
datatypes.path
import
Path
from
datatypes.positional_word_part
import
PositionalWordPart
from
datatypes.transkriptionField
import
TranskriptionField
from
datatypes.transkription_position
import
TranskriptionPosition
from
datatypes.word
import
Word
,
execute_function_on_parts
from
datatypes.word_position
import
WordPosition
class Page:
    """Minimal stand-in for a page object used by the tests in this module.

    ``test_split`` passes an instance of this class to
    ``PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST``; the
    methods below return fixed values so that no page file has to be loaded.
    """

    def __init__(self):
        """Create the stub with no SVG file attached."""
        self.svg_file = None

    def get_line_number(self, input=0):
        """Return the fixed line number ``-1``, whatever ``input`` is.

        The parameter name ``input`` shadows the builtin, but it is kept
        because callers may pass it by keyword.
        """
        return -1

    def get_biggest_fontSize4styles(self, style_set=None):
        """Return the fixed font size ``7``, whatever ``style_set`` is.

        ``style_set`` defaults to ``None`` instead of a shared mutable ``{}``;
        the argument is never read, so callers observe no difference.
        """
        return 7
class TestWord(unittest.TestCase):
    """Unit tests for ``datatypes.word.Word`` and ``execute_function_on_parts``."""

    def setUp(self):
        """Prepare fixture file paths, word part dictionaries and a <word> node."""
        # path.join (instead of dirname + sep + ...) keeps the paths relative
        # when path.dirname(__file__) is empty; plain concatenation would
        # produce the root-anchored '/test_data' in that case.
        data_dir = path.join(path.dirname(__file__), 'test_data')
        self.test_file = path.join(data_dir, 'N_VII_1_page009.xml')
        self.pdf_xml = path.join(data_dir, 'W_I_8_page125.xml')
        self.pdf_xml_source = path.join(data_dir, 'W_I_8_neu_125-01.svg')
        # One part per character of 'abc', with x = 0, 1, 2 and a common y.
        self.word_part_objs = [
            {'text': char, 'class': 'st22', 'x': x, 'y': 11}
            for x, char in enumerate('abc')
        ]
        word_attributes = {'text': 'abc', 'id': '0', 'line-number': '2', 'deleted': 'true'}
        word_position = TranskriptionPosition(
            x=0, y=1, height=10, width=10,
            matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)'))
        self.transkription_positions = [word_position]
        self.word_node = ET.Element('word', attrib=word_attributes)
        word_position.attach_object_to_tree(self.word_node)
        for x, char in enumerate(word_attributes['text']):
            ET.SubElement(
                self.word_node, 'part',
                attrib={'text': char, 'x': str(x), 'y': '11', 'class': 'st22'})

    @staticmethod
    def _word_from(pwps, transkription_positions):
        """Build a Word whose text is the concatenation of the texts of ``pwps``."""
        return Word(text=''.join(pwp.text for pwp in pwps),
                    transkription_positions=transkription_positions)

    def test_Word_with_word_part_objs(self):
        """Word.CREATE_WORD builds a word from word part dictionaries."""
        word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
        position = word.transkription_positions[0]
        self.assertEqual(word.id, 0)
        self.assertEqual(position.bottom, 13)
        self.assertEqual(position.height, 10)
        self.assertEqual(position.top, 3)
        self.assertEqual(position.left, 0)
        self.assertEqual(position.width, 10)
        self.assertEqual(word.text, 'abc')

    def test_Word_with_word_node(self):
        """Word.create_cls reads id, status, position and text from a <word> node."""
        word = Word.create_cls(self.word_node)
        position = word.transkription_positions[0]
        self.assertEqual(word.id, 0)
        self.assertEqual(word.deleted, True)
        self.assertEqual(position.bottom, 11)
        self.assertEqual(position.height, 10)
        self.assertEqual(position.top, 1)
        self.assertEqual(position.left, 0)
        self.assertEqual(position.width, 10)
        self.assertEqual(word.text, 'abc')
        self.assertEqual(word.line_number, 2)
        self.assertEqual(position.transform.isRotationMatrix(), True)

    def test_attach_word_to_tree(self):
        """A word attached to a tree can be recreated from its <word> node."""
        newWord = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
        empty_tree = ET.ElementTree(ET.Element('page'))
        newWord.attach_word_to_tree(empty_tree)
        word_nodes = empty_tree.getroot().xpath('//word')
        # Guard against a vacuous pass: without any <word> node the loop
        # below would never execute a single assertion.
        self.assertGreater(len(word_nodes), 0)
        for word_node in word_nodes:
            word = Word.CREATE_WORD(word_node=word_node)
            position = word.transkription_positions[0]
            self.assertEqual(word.id, 0)
            self.assertEqual(word.deleted, False)
            self.assertEqual(position.bottom, 13)
            self.assertEqual(position.height, 10)
            self.assertEqual(position.top, 3)
            self.assertEqual(position.left, 0)
            self.assertEqual(position.width, 10)
            self.assertEqual(word.text, 'abc')

    def test_split(self):
        """Word.split returns (previous, current, next) words and warns on bad splits."""
        page = Page()
        pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, self.word_part_objs)
        transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
        # The same pwps / transkription_positions objects are deliberately
        # reused for the next three words, exactly as a fresh Word each time.

        # Split in the middle: 'a' | 'b' | 'c'.
        word = self._word_from(pwps, transkription_positions)
        previousWord, currentWord, nextWord = word.split('b')
        self.assertEqual(previousWord.id, 0)
        self.assertEqual(previousWord.text, 'a')
        self.assertEqual(currentWord.id, 1)
        self.assertEqual(nextWord.id, 2)

        # Split string reaches the end of the word: 'a' | 'bc'.
        word = self._word_from(pwps, transkription_positions)
        previousWord, currentWord, nextWord = word.split('bc')
        self.assertEqual(previousWord.id, 0)
        self.assertEqual(previousWord.text, 'a')
        self.assertEqual(currentWord.id, 1)

        # Split string starts the word and ids start at start_id: 'ab' | 'c'.
        word = self._word_from(pwps, transkription_positions)
        previousWord, currentWord, nextWord = word.split('ab', start_id=10)
        self.assertEqual(currentWord.id, 10)
        self.assertEqual(currentWord.text, 'ab')
        # NOTE(review): exact float comparison kept as in the original test;
        # presumably the widths are rounded upstream -- confirm.
        self.assertEqual(currentWord.transkription_positions[0].width, 2.1)
        self.assertEqual(nextWord.id, 11)
        self.assertEqual(nextWord.transkription_positions[0].width, 5.2)

        # The split string does not end on a word part boundary: a warning is expected.
        word_part_objs = [
            {'text': 'x', 'class': 'st22', 'x': 0, 'y': 0},
            {'text': 'Insofern', 'class': 'st22', 'x': 1, 'y': 0},
            {'text': 'x', 'class': 'st22', 'x': 10, 'y': 0},
        ]
        pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
        transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
        word = self._word_from(pwps, transkription_positions)
        with self.assertWarns(Warning):
            previousWord, currentWord, nextWord = word.split('Insofer')

        # The split string lies inside a single word part: a warning is expected.
        word_part_objs = [{'text': 'xInsofern', 'class': 'st22', 'x': 0, 'y': 0}]
        pwps = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs)
        transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps)
        word = self._word_from(pwps, transkription_positions)
        with self.assertWarns(Warning):
            previousWord, currentWord, nextWord = word.split('Insofern')

    def test_join(self):
        """Word.join appends another word's text, or prepends it on request."""
        word = Word.CREATE_WORD(word_part_objs=self.word_part_objs, height=10, endX=10)
        other_word = Word.CREATE_WORD(
            word_part_objs=[{'text': '.', 'class': 'st22', 'x': 3, 'y': 11}])
        word.join(other_word)
        self.assertEqual(word.text, 'abc.')
        other_word = Word.CREATE_WORD(
            word_part_objs=[{'text': '.', 'class': 'st22', 'x': 3, 'y': 11}])
        word.join(other_word, append_at_end_of_new_word=False)
        self.assertEqual(word.text, '.abc.')
        # Debugging aid (disabled):
        # tree = ET.ElementTree(ET.Element('page'))
        # word.attach_word_to_tree(tree)
        # print(ET.dump(tree.getroot()))

    def test_get_semanticAndDataDict(self):
        """Word.get_semantic_dictionary can be called after attaching a word."""
        word = Word.CREATE_WORD(word_node=self.word_node)
        empty_tree = ET.ElementTree(ET.Element('page'))
        word.attach_word_to_tree(empty_tree)
        dictionary = Word.get_semantic_dictionary()
        # Minimal check so that the test asserts something at all; the more
        # detailed checks below are disabled.
        self.assertIsNotNone(dictionary)
        #print(dictionary)
        #for key in dictionary['properties'].keys():
        #    xpath = dictionary['properties'].get(key).get('xpath')\
        #            if type(dictionary['properties'].get(key)) is dict\
        #            else dictionary['properties'].get(key)[2]
        #    results = empty_tree.xpath(xpath)
        #    self.assertEqual(len(results), 1)
        #    #print('{}: {}'.format(key, results[0]))
        #self.assertEqual(word.get_data_dictionary()['body'].get('text'), 'abc')

    def test_simplify_transkription_positions(self):
        """simplify_transkription_positions merges two positions into one."""
        node_string = (
            '<transkription-position bottom="234.0" height="7.328" id="0" left="144.925" top="225.672" width="4.703" writing-process-id="0">\n'
            '<word-part bottom="234.0" height="7.328" id="0" left="144.925" style-class="st11 st12" symbol-id="glyph6-7" text="S" top="226.672" width="4.703"/>\n'
            '</transkription-position> '
        )
        nodeA = ET.fromstring(node_string)
        node_string = (
            '<transkription-position bottom="234.0" height="7.078" id="1" left="150.586" top="232.438" width="0.844" writing-process-id="0">\n'
            '<word-part bottom="234.0" height="7.078" id="1" left="150.586" style-class="st11 st12" symbol-id="glyph6-2" text="i" top="226.922" width="0.844"/>\n'
            '</transkription-position>\n'
        )
        nodeB = ET.fromstring(node_string)
        word = Word(
            text="Si",
            transkription_positions=[
                TranskriptionPosition(node=nodeA),
                TranskriptionPosition(node=nodeB),
            ])
        self.assertEqual(len(word.transkription_positions), 2)
        word.simplify_transkription_positions()
        self.assertEqual(len(word.transkription_positions), 1)
        # Debugging aid (disabled):
        # tree = ET.ElementTree(ET.Element('page'))
        # word.attach_word_to_tree(tree)
        # print(ET.dump(tree.getroot()))

    def test_partition(self):
        """Partitioning by writing process id yields word parts that survive a round trip."""
        page = datatypes.page.Page(xml_source_file=self.test_file)
        word = page.words[67]
        self.assertEqual(word.belongs_to_multiple_writing_processes(), True)
        word.partition_according_to_writing_process_id()
        self.assertEqual(len(word.word_parts), 3)
        self.assertEqual(word.belongs_to_multiple_writing_processes(), False)
        self.assertEqual(word.belongs_to_multiple_writing_processes(include_parts=True), True)
        # Round trip: attach the partitioned word and recreate it from its node.
        empty_tree = ET.ElementTree(ET.Element('page'))
        word_node = word.attach_word_to_tree(empty_tree)
        newWord = Word.create_cls(word_node)
        self.assertEqual(len(newWord.word_parts), 3)
        self.assertEqual(newWord.line_number, -1)
        #print(ET.dump(empty_tree.getroot()))

    def test_partition_deletion(self):
        """Partitioning by deletion status splits a word into word parts."""
        page = datatypes.page.Page(xml_source_file=self.test_file)
        word = page.words[67]
        # Mark exactly the positions of writing process 1 as deleted.
        for transkription_position in word.transkription_positions:
            transkription_position.deleted = transkription_position.writing_process_id == 1
        self.assertEqual(word.has_mixed_status('deleted'), True)
        word.partition_according_to_deletion()
        self.assertEqual(len(word.word_parts), 3)
        self.assertEqual(word.has_mixed_status('deleted'), False)
        self.assertEqual(word.has_mixed_status('deleted', include_parts=True), True)
        # Second scenario: partition by writing process first, then delete a
        # single position inside one of the resulting parts.
        page = datatypes.page.Page(xml_source_file=self.test_file)
        word = page.words[67]
        word.partition_according_to_writing_process_id()
        #print([(word.text, word.deleted) for word in word.word_parts])
        word.word_parts[1].transkription_positions[1].deleted = True
        word.partition_according_to_deletion()
        self.assertEqual(len(word.word_parts), 4)
        #print([(word.text, word.deleted) for word in word.word_parts])

    def test_execute_function_on_parts(self):
        """execute_function_on_parts applies the named method to every word."""
        page = datatypes.page.Page(xml_source_file=self.test_file)
        word_parts = [page.words[67], page.words[68]]
        # The second element of the returned pair is not needed here.
        word_parts, _ = execute_function_on_parts(
            word_parts, 'partition_according_to_writing_process_id')
        # Compare the length directly so that a failure reports the actual value.
        self.assertEqual(len(word_parts), 4)

    def test_process_word_boxes(self):
        """process_boxes returns words that carry an earlier version."""
        page = datatypes.page.Page(xml_source_file=self.pdf_xml)
        page.source = self.pdf_xml_source
        for word in page.words:
            word.partition_according_to_writing_process_id()
        tr = TranskriptionField(page.source)
        box_path_d = [
            'M 598.11,626.565 L 603.557,626.565 L 603.557,632.565 L 598.11,632.565 L 598.11,626.565',
            'M 557.443,683.44 L 574.182,683.44 L 574.182,694.815 L 557.443,694.815 L 557.443,683.44',
            'M 404.193,659.565 L 407.80699999999996,659.565 L 407.80699999999996,668.94 L 404.193,668.94 L 404.193,659.565',
            'M 587.932,634.065 L 598.318,634.065 L 598.318,643.19 L 587.932,643.19 L 587.932,634.065',
            'M 570.443,221.315 L 576.557,221.315 L 576.557,230.065 L 570.443,230.065 L 570.443,221.315',
        ]
        box_paths = [Box(d_string=d_string, earlier_text='test') for d_string in box_path_d]
        indices = [30, 276, 287, 295, 319]
        for index in indices:
            later_word = page.words[index].process_boxes(
                box_paths, tr_xmin=tr.xmin, tr_ymin=tr.ymin)
            self.assertIsNotNone(later_word.earlier_version)

    def test_split_according_to_status(self):
        """split_according_to_status splits a word whose positions differ in a status."""
        page = datatypes.page.Page(xml_source_file=self.test_file)
        word = page.words[67]
        # Give the positions of writing process 1 a different text so that
        # the 'text' status of the word becomes mixed.
        for transkription_position in word.transkription_positions:
            if transkription_position.writing_process_id == 1:
                transkription_position.text = 'asdf'
            else:
                transkription_position.text = word.text
        self.assertEqual(word.has_mixed_status('text'), True)
        new_words = word.split_according_to_status('text')
        self.assertGreater(len(new_words), 1)
        self.assertEqual(new_words[0].id, word.id)
        self.assertEqual(new_words[0].deleted, word.deleted)
        self.assertEqual(new_words[1].id, word.id + 1)
        #print([ word.text for word in new_words ])
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Event Timeline
Log In to Comment