Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F59098916
word.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Apr 19, 18:52
Size
49 KB
Mime Type
text/x-python
Expires
Sun, Apr 21, 18:52 (2 d)
Engine
blob
Format
Raw Data
Handle
17126052
Attached To
rNIETZSCHEPYTHON nietzsche-python
word.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata: authorship, contact, license and release information.
__author__ = 'Christian Steiner'
__maintainer__ = __author__  # the maintainer is the author
__copyright__ = 'University of Basel'
__email__ = 'christian.steiner@unibas.ch'
__status__ = 'Development'
__license__ = 'GPL v3'
__version__ = '0.0.1'
import
copy
import
inspect
from
lxml
import
etree
as
ET
from
operator
import
attrgetter
import
re
import
string
import
sys
import
warnings
from
.box
import
Box
from
.editor_comment
import
EditorComment
from
.matrix
import
Matrix
from
.path
import
Path
from
.simple_word
import
SimpleWord
from
.style
import
Style
from
.word_deletion_path
import
WordDeletionPath
from
.word_position
import
WordPosition
from
.transkription_position
import
TranskriptionPosition
from
.writing_process
import
WritingProcess
# Regular expression that matches a text consisting of exactly one punctuation
# character: any character of string.punctuation or an en dash.
# The punctuation characters are escaped with re.escape before they are put
# into the character class.  Unescaped, the sequence '\]' of string.punctuation
# was read as an escaped ']', so a single backslash was never matched.
SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(re.escape(string.punctuation))
def execute_function_on_parts(word_parts, func_name):
    """Execute function on parts and add those parts instead of original word to word_parts.

    The method named func_name is called without arguments on every word of
    word_parts.  If a word has word_parts of its own after the call, the word
    is replaced in the returned list by these parts (in their order) and the
    word's own word_parts list is emptied.

    :param word_parts: list of words; the list itself is not modified.
    :param func_name: name of the method that is called on each word.
    :return: new word_parts, output from func (the output of the call on the
        last word, or None if word_parts is empty)
    :raises AttributeError: if a word has no method called func_name.
    """
    copy_parts = word_parts[:]
    # Defined up front: with an empty word_parts the loop never runs.
    output = None
    for word in word_parts:
        # Look the method up by name instead of evaluating a code string.
        output = getattr(word, func_name)()
        if len(word.word_parts) > 0:
            for part_word in word.word_parts:
                # Inserting before the word keeps the parts in their order.
                copy_parts.insert(copy_parts.index(word), part_word)
            copy_parts.remove(word)
            word.word_parts = []
    return copy_parts, output
def update_transkription_position_ids(word):
    """Renumber the transkription_positions of word according to their 'left' value.

    If the ids of word.word_parts are not unique, the word parts are
    renumbered by their list index first.  Afterwards every transkription
    position gets its rank in left-to-right order as id, and its has_box and
    deleted attributes are reset to None and False.

    :param word: object with word_parts and transkription_positions.
    """
    part_ids = [part.id for part in word.word_parts]
    has_duplicate_part_ids = len(part_ids) != len(set(part_ids))
    if has_duplicate_part_ids:
        for position_in_list, part in enumerate(word.word_parts):
            part.id = position_in_list
    left_to_right = sorted(word.transkription_positions, key=attrgetter('left'))
    for rank, position in enumerate(left_to_right):
        position.id = rank
        position.has_box = None
        position.deleted = False
class
Word
(
SimpleWord
):
"""
This class represents a word.
"""
# Property names; presumably evaluated by the superclass when words are copied -- TODO confirm.
COPY_PROPERTY_KEY = ['line_number', 'deleted', 'writing_process_id']
# Maps a source property to the list property it is appended to -- usage not visible here.
APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = {'style': 'styles'}
# Tag of the debug data node below a word node (read in CREATE_WORD).
DATA = 'debug-data'
RDFS_SUBCLASSOF_LIST = ['http://www.e-editiones.ch/ontology/text#HandwrittenText']
# Tag names used when a word is written to or read from xml.
XML_TAG = 'word'
XML_EARLIER_VERSION = 'earlier-version'
XML_OVERWRITES = 'overwrites'
# Maps the correction properties of a word to the xml attributes that flag them.
XML_CORRECTION_DICT = dict(
    isClarificationOfWord='clarifiesWord',
    isDeletionOfWord='deletesEarlierPart',
    isExtensionOfWord='extendsEarlierVersion',
    isTransformationOfWord='transformsEarlierPart',
)
def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None,
             faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1,
             earlier_version=None, box_paths=None, styles=None):
    """Initialize a word.

    id, text, line_number and both position lists are handed to the
    superclass; everything else is stored on the word itself.  If no text is
    given, the text is taken from the transkription positions.  A new list is
    created for every list argument that is None.

    NOTE: box_paths is accepted but not used in this method.
    """
    super(Word, self).__init__(
        id=id,
        text=text,
        line_number=line_number,
        transkription_positions=transkription_positions,
        faksimile_positions=faksimile_positions,
    )
    # state concerning corrections and deletions
    self.corrections = []
    self.deleted = deleted
    self.deletion_paths = []
    # debugging helpers
    self.debug_container = {}
    self.debug_msg = None
    self.earlier_version = earlier_version
    self.edited_text = None
    self.editor_comment = None
    # relations to corrected words, see XML_CORRECTION_DICT
    self.isClarificationOfWord = None
    self.isDeletionOfWord = None
    self.isExtensionOfWord = None
    self.isTransformationOfWord = None
    if len(self.text) == 0:
        # Only positions of exactly the type TranskriptionPosition decide
        # whether there is text, but the text is joined from all positions.
        text_of_typed_positions = ''.join([
            position.get_text() for position in self.transkription_positions
            if type(position) == TranskriptionPosition
        ])
        if len(text_of_typed_positions) > 0:
            self.text = ''.join([position.get_text() for position in self.transkription_positions])
    self.overwrites_word = None
    if styles is not None:
        self.styles = styles
    else:
        self.styles = []
    self.verified = None
    self.writing_process_id = writing_process_id
    self.writing_processes = []
    self.word_insertion_mark = None
    self.word_box = None
    if word_parts is not None:
        self.word_parts = word_parts
    else:
        self.word_parts = []
    if word_part_objs is not None:
        self.word_part_objs = word_part_objs
    else:
        self.word_part_objs = []
def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Add those deletion paths to the word that intersect with it.

    A word with word parts only delegates to its parts.  Otherwise, if the
    word is deleted and has a transkription position, a path is created from
    its first transkription position and self.deletion_paths is replaced by
    the paths of deletion_paths that intersect with this path.

    :param deletion_paths: candidate deletion paths.
    :param tr_xmin: passed on to Path.create_path_from_transkription_position.
    :param tr_ymin: passed on to Path.create_path_from_transkription_position.
    """
    if len(self.word_parts) > 0:
        for word_part in self.word_parts:
            word_part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        return
    if not (self.deleted and len(self.transkription_positions) > 0):
        return
    first_position = self.transkription_positions[0]
    word_path = Path.create_path_from_transkription_position(first_position, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
    # NOTE(review): do_paths_intersect_saveMode is not among the imports visible
    # at the top of this file -- confirm that it is defined or imported elsewhere.
    intersecting_paths = []
    for candidate in deletion_paths:
        if do_paths_intersect_saveMode(candidate, word_path):
            intersecting_paths.append(candidate)
    self.deletion_paths = intersecting_paths
def attach_word_to_tree(self, target_tree):
    """Attaches word to tree target_tree.

    The superclass creates the word node; this method adds the attributes and
    child nodes of this class: deleted, verified, edited-text,
    writing-process-id, editor comment, word parts, earlier version,
    overwritten word, word box, corrections and the correction flags of
    XML_CORRECTION_DICT.

    Side effect: the ids of the word parts are set to their list index.

    :param target_tree: tree or node to which the word node is attached.
    :return: the word node created by the superclass.
    """
    word_node = super(Word, self).attach_word_to_tree(target_tree)
    if self.deleted is not None:
        word_node.set('deleted', str(self.deleted).lower())
    if self.verified is not None:
        word_node.set('verified', str(self.verified).lower())
    if self.edited_text is not None:
        word_node.set('edited-text', self.edited_text)
    if self.editor_comment is not None:
        self.editor_comment.attach_object_to_tree(word_node)
    if self.writing_process_id > -1:
        word_node.set('writing-process-id', str(self.writing_process_id))
    for index, word_part in enumerate(self.word_parts):
        word_part.id = index
        word_part.attach_word_to_tree(word_node)
    if self.earlier_version is not None:
        earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION)
        self.earlier_version.attach_word_to_tree(earlier_node)
    if self.overwrites_word is not None\
            and len(self.overwrites_word.transkription_positions) > 0:
        overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES)
        self.overwrites_word.attach_word_to_tree(overwrite_node)
    if self.word_box is not None:
        self.word_box.attach_object_to_tree(word_node)
    if len(self.corrections) > 0:
        # De-duplicate in first-seen order: joining a set of strings gave an
        # attribute value whose order could change from run to run.
        correction_ids = dict.fromkeys(str(word.id) for word in self.corrections)
        word_node.set('corrections', ' '.join(correction_ids))
    for key in self.XML_CORRECTION_DICT.keys():
        if self.__dict__[key] is not None:
            word_node.set(self.XML_CORRECTION_DICT[key], 'true')
    return word_node
def belongs_to_multiple_writing_processes(self, include_parts=False):
    """Return True if more than one writing_process_id occurs.

    With include_parts=True and existing word parts the ids of the word
    parts are compared, otherwise the ids of the transkription positions.

    :param include_parts: compare the word parts if there are any.
    :return: bool
    """
    if len(self.word_parts) > 0 and include_parts:
        id_carriers = self.word_parts
    else:
        id_carriers = self.transkription_positions
    distinct_ids = {carrier.writing_process_id for carrier in id_carriers}
    return len(distinct_ids) > 1
def set_parent_word_writing_process_id(self):
    """Set writing_process_id for parent word.

    Looks at the style of the first transkription position of every word
    part (parts without positions or without a style are ignored).  If these
    styles differ, writing_process_id becomes the highest writing_process_id
    of the styles; it is increased by one if the styles still differ after
    their writing_process_id has been removed.
    """
    first_styles = [
        part.transkription_positions[0].style for part in self.word_parts
        if len(part.transkription_positions) > 0 and part.transkription_positions[0].style is not None
    ]
    distinct_styles = set(first_styles)
    if len(distinct_styles) <= 1:
        return
    self.writing_process_id = max([style.writing_process_id for style in distinct_styles])
    styles_without_process_id = set(style.create_a_copy_wo_writing_process_id() for style in first_styles)
    if len(styles_without_process_id) > 1:
        self.writing_process_id += 1
@classmethod
def create_cls(cls, word_node):
    """Creates a word from a (lxml.Element) node.

    The superclass creates the basic object; this method then reads the
    attributes and child nodes handled by this class: writing-process-id,
    join, split, verified, deleted, edited-text, editor comment, word parts,
    corrections, earlier version, correction flags, overwritten word and
    word box.

    :param word_node: the node from which the word is created.
    :return: Word
    :raises Exception: if the joined 'split' attribute differs from the text,
        or if a word part refers to a missing part of the earlier version.
    """
    # NOTE: from here on the name cls refers to the object returned by the
    # superclass (presumably a Word instance), no longer to the class.
    cls = super(Word,cls).create_cls(word_node)
    cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1
    cls.split_strings = None
    cls.join_string = word_node.get('join')
    if bool(word_node.get('split')):
        cls.split_strings = word_node.get('split').split(' ')
        # The pieces of the split attribute must add up to the text.
        if ''.join(cls.split_strings) != cls.text:
            error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\
                    format(word_node.getroottree().docinfo.URL, str(cls.id))\
                    + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\
                    + 'Text attribute: "{0}".\n'.format(cls.text)
            raise Exception(error_msg)
    # verified and deleted are None if the attribute is missing or empty.
    cls.verified = word_node.get('verified') == 'true'\
            if bool(word_node.get('verified')) else None
    cls.deleted = word_node.get('deleted') == 'true'\
            if bool(word_node.get('deleted')) else None
    cls.edited_text = word_node.get('edited-text')
    # Only the first editor comment node is used.
    cls.editor_comment = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ][0]\
            if len([ node for node in word_node.xpath('./' + EditorComment.XML_TAG) ]) > 0 else None
    # Direct child word nodes become the word parts (recursive call).
    cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ]
    if bool(word_node.get('corrections')):
        # The corrections attribute is a space separated list of indices into
        # word_parts; indices beyond the list are ignored.
        for index in [ int(i) for i in word_node.get('corrections').split(' ') ]:
            if index < len(cls.word_parts):
                cls.corrections.append(cls.word_parts[index])
    cls.earlier_version = None
    if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0:
        cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0]
    # Correction flags are stored under the name of their xml attribute
    # (the values of XML_CORRECTION_DICT), not under the property name.
    for key_value in cls.XML_CORRECTION_DICT.values():
        if word_node.get(key_value) == 'true':
            cls.__dict__[key_value] = True
    if cls.earlier_version is not None:
        # Turn the flags of the word parts into references:
        # '...Part' flags refer to the part of the earlier version with the same id,
        # '...EarlierVersion' flags refer to the earlier version itself,
        # '...Word' flags refer to this word.
        for word_part in cls.word_parts:
            for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]:
                if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\
                        and len(cls.word_parts) <= len(cls.earlier_version.word_parts):
                    try:
                        word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id]
                    except Exception:
                        # NOTE(review): the original exception is replaced by a
                        # message naming the word and the id of the part.
                        msg = f'{cls.id} {cls.text}: {word_part.id}'
                        raise Exception(msg)
            for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]:
                if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
                    word_part.__dict__[key] = cls.earlier_version
            for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]:
                if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]:
                    word_part.__dict__[key] = cls
    # Only the first overwritten word and the first box node are used.
    cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\
            if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\
            else None
    cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\
            if len(word_node.xpath('./' + Box.XML_TAG)) > 0\
            else None
    return cls
@classmethod
def join_words(cls, list_of_words):
    """Creates a word from a list of words.

    With more than one word a new word is created that has the words as its
    word parts; words that already have word parts are replaced by these
    parts.  With exactly one word this word is returned, with an empty list
    None is returned.

    NOTE: list_of_words is modified in place and becomes the word_parts list
    of the new word; the ids of the parts are set to their list index.

    :param list_of_words: the words that are joined.
    :return: Word (or None if list_of_words is empty)
    """
    if len(list_of_words) > 1:
        # The new word is deleted only if every word is deleted.
        deleted = True in [ word.deleted for word in list_of_words ]\
                and len(set([ word.deleted for word in list_of_words ])) == 1
        # The line number is kept only if all words share it.
        line_number = list_of_words[0].line_number\
                if len(set([ word.line_number for word in list_of_words ])) == 1\
                else -1
        # NOTE(review): the list is changed while it is iterated; after a
        # replacement the iteration continues behind the first inserted
        # part -- confirm that parts never have parts of their own here.
        for word in list_of_words:
            if len(word.word_parts) > 0:
                index = list_of_words.index(word)
                list_of_words.remove(word)
                # Inserting in reversed order at the same index keeps the order of the parts.
                for part_word in reversed(word.word_parts):
                    list_of_words.insert(index, part_word)
        new_word = cls(id=list_of_words[0].id, text=''.join([word.text for word in list_of_words]),\
                line_number=line_number, deleted=deleted, word_parts=list_of_words)
        # If a part other than the last one ends with '-' or '=', the edited
        # text is the text without this trailing character.  Only the first
        # such part is considered, and str.replace replaces every occurrence
        # of its text.
        if True in [ word.text.endswith('-') or word.text.endswith('=') for word in new_word.word_parts[:-1]]:
            change_text = [ word.text for word in new_word.word_parts[:-1] if word.text.endswith('-') or word.text.endswith('=') ][0]
            new_word.edited_text = new_word.text.replace(change_text, change_text[:-1])
        for id, word in enumerate(new_word.word_parts):
            word.id = id
        return new_word
    if len(list_of_words) > 0:
        return list_of_words[0]
    else:
        return None
def create_earlier_version(self, root_word=None, id=0):
    """Create an earlier version of word.

    An earlier version is created recursively for every word part.  A word
    part is registered in self.corrections if it is deleted, if it overwrites
    a word in a different style (or its word box has an earlier version), or
    if its style belongs to the writing process of the root word.

    Side effects: if all word parts that are not a single punctuation
    character are deleted, the word itself is marked as deleted instead of
    these parts.  If the earlier version consists of exactly one part, the
    positions of this part are added to the positions of this word.

    :param root_word: the word on which the creation started; defaults to
        this word, whose writing_process_id is then updated first.
    :param id: the id of the word that is returned.
    :return: Word
    """
    if root_word is None:
        root_word = self
        root_word.set_parent_word_writing_process_id()
    word_parts = []
    non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\
            if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ]
    non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts)
    if non_single_punctuation_word_parts_length > 0\
            and len([ word_part for word_part in non_single_punctuation_word_parts\
            if word_part.deleted ])\
            == non_single_punctuation_word_parts_length:
        # Everything but punctuation is deleted: the deletion concerns the word as a whole.
        self.deleted = True
        for word_part in non_single_punctuation_word_parts:
            word_part.deleted = False
    # The loop variable must not be called 'id': it used to shadow the
    # parameter, so the returned word got the index of the last word part
    # as its id instead of the id that was asked for.
    for index, word_part in enumerate(self.word_parts):
        earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=index)
        if word_part.deleted:
            # deleted part: it existed in the earlier version
            word_part.isDeletionOfWord = earlierWordPart
            word_parts.append(earlierWordPart)
            if word_part not in self.corrections:
                self.corrections.append(word_part)
        elif word_part.overwrites_word is not None\
                and ((len(word_part.transkription_positions) > 0\
                and word_part.overwrites_word.transkription_positions[0].style is not None\
                and word_part.transkription_positions[0].style is not None\
                and word_part.transkription_positions[0].style\
                != word_part.overwrites_word.transkription_positions[0].style)
                or (word_part.word_box is not None and word_part.word_box.earlier_version)):
            # overwriting part: the earlier version contains the overwritten word
            # (word_box is checked for None, it is not set for every word)
            word_part.overwrites_word.id = word_part.id
            word_parts.append(word_part.overwrites_word)
            word_part.isTransformationOfWord = word_part.overwrites_word
            if word_part not in self.corrections:
                self.corrections.append(word_part)
        elif root_word.writing_process_id > -1\
                and (len(word_part.transkription_positions) > 0\
                and word_part.transkription_positions[0].style is not None\
                and word_part.transkription_positions[0].style.writing_process_id\
                == root_word.writing_process_id):
            # part of the writing process of the root word: it extends the
            # earlier version and is therefore not part of it
            word_part.extendsEarlierVersion = True
            if word_part not in self.corrections:
                self.corrections.append(word_part)
        else:
            # unchanged part (a deleted part never gets here, see first branch)
            word_parts.append(earlierWordPart)
    text = ''.join([ word.text for word in word_parts ])\
            if len(word_parts) > 0\
            else self.text
    if len(word_parts) == 1:
        self.transkription_positions += word_parts[0].transkription_positions
        self.faksimile_positions += word_parts[0].faksimile_positions
        word_parts = []
    new_transkription_positions = copy.deepcopy(self.transkription_positions)
    if len(self.transkription_positions) > 0\
            and self.transkription_positions[0].style is not None:
        writing_process_id = self.transkription_positions[0].style.writing_process_id
        for new_tp in new_transkription_positions:
            # only the style of the first position is known to exist
            if new_tp.style is not None:
                new_tp.style.writing_process_id = writing_process_id
    return Word(id=id, text=text, transkription_positions=new_transkription_positions,\
            faksimile_positions=self.faksimile_positions, line_number=self.line_number,\
            word_parts=word_parts)
def create_correction_history(self, page=None, box_style=None):
    """Create correction history.

    If the word has a word box, the overwritten word is created from the
    earlier text of the box.  Then the correction history of the word parts
    is created; if there are word parts, an earlier version of the word is
    created and kept if the word has corrections.

    :param page: if given, the style of the overwritten word is created from
        the page and the text style class of the word box.
    :param box_style: style of the overwritten word if no page is given.
    """
    if self.word_box is not None:
        manuscript = None
        if len(self.transkription_positions) > 0\
                and self.transkription_positions[0].style is not None:
            manuscript = self.transkription_positions[0].style.manuscript
        overwritten_style = Style()
        if box_style is not None:
            overwritten_style = box_style
        if page is not None:
            # a page takes precedence over box_style
            overwritten_style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript)
            known_font_keys = [ font_key for font_key in self.word_box.text_style_class.split(' ')\
                    if font_key in page.fontsizekey2stage_mapping.keys() ]
            for font_key in known_font_keys:
                overwritten_style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key)
        copied_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions)
        for copied_position in copied_positions:
            copied_position.style = overwritten_style
        self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=copied_positions,\
                line_number=self.line_number)
    for word_part in self.word_parts:
        word_part.create_correction_history(page=page, box_style=box_style)
    if len(self.word_parts) == 0:
        return
    earlier_version = self.create_earlier_version()
    for extending_word in self._get_parts_with_property_key('extendsEarlierVersion'):
        extending_word.isExtensionOfWord = earlier_version
    if self.has_mixed_status('deleted', include_parts=True):
        # the edited text consists of the parts that are not deleted
        self.edited_text = ''.join([ word_part.text for word_part in self.word_parts if not word_part.deleted ])
    if len(self.corrections) > 0:
        self.earlier_version = earlier_version
@staticmethod
def
CREATE_WORD
(
word_node
=
None
,
page
=
None
,
word_part_objs
=
[],
id
=
0
,
height
=
0
,
endX
=
0
,
endSign
=
None
,
matrix
=
None
,
line_number
=-
1
,
debug_msg
=
None
):
"""Creates a word from a (lxml.Element) node or word_part_objs.
[:return:] Word
"""
if
word_node
is
not
None
:
# init word from xml node
id
=
int
(
word_node
.
get
(
'id'
))
line_number
=
int
(
word_node
.
get
(
'line-number'
))
if
bool
(
word_node
.
get
(
'line-number'
))
else
line_number
text
=
word_node
.
get
(
'text'
)
deleted
=
bool
(
word_node
.
get
(
'deleted'
))
and
word_node
.
get
(
'deleted'
)
==
'true'
transkription_positions
=
[
TranskriptionPosition
(
node
=
node
)
for
node
in
word_node
.
findall
(
'.//'
+
WordPosition
.
TRANSKRIPTION
)
]
faksimile_positions
=
[
WordPosition
(
node
=
node
)
for
node
in
word_node
.
findall
(
'.//'
+
WordPosition
.
FAKSIMILE
)
]
word_part_objs
=
[
item
.
attrib
for
item
in
word_node
.
findall
(
'.//'
+
Word
.
DATA
+
'/part'
)]
\
if
len
(
word_node
.
findall
(
'.//'
+
Word
.
DATA
))
>
0
\
else
[
item
.
attrib
for
item
in
word_node
.
findall
(
'.//part'
)]
return
Word
(
id
=
id
,
text
=
text
,
deleted
=
deleted
,
line_number
=
line_number
,
transkription_positions
=
transkription_positions
,
\
faksimile_positions
=
faksimile_positions
,
word_part_objs
=
word_part_objs
)
elif
len
(
word_part_objs
)
>
0
:
# init word from word_part_obj that has been extracted from svg file
WIDTH
=
5
TOPCORRECTION
=
2.0
FONTWIDTHFACTOR
=
0.7
# factor that multiplies lastCharFontSize
height
=
height
x
=
round
(
float
(
word_part_objs
[
0
][
'x'
]),
3
)
if
(
page
is
not
None
and
bool
(
page
.
style_dict
)):
HEIGHT_FACTOR
=
1.1
# factor that multiplies biggest_font_size -> height
style_set
=
set
(
' '
.
join
(
set
(
dict
[
'class'
]
for
dict
in
word_part_objs
))
.
split
(
' '
))
biggest_font_size
=
page
.
get_biggest_fontSize4styles
(
style_set
=
style_set
)
height
=
round
(
biggest_font_size
*
HEIGHT_FACTOR
+
HEIGHT_FACTOR
/
biggest_font_size
,
3
)
TOPCORRECTION
=
1
+
HEIGHT_FACTOR
/
biggest_font_size
if
endSign
is
not
None
and
'%'
in
endSign
:
lastCharFontSizeList
=
[
float
(
page
.
style_dict
[
key
][
'font-size'
]
.
replace
(
'px'
,
''
))
\
for
key
in
word_part_objs
[
len
(
word_part_objs
)
-
1
][
'class'
]
.
split
(
' '
)
\
if
bool
(
page
.
style_dict
[
key
]
.
get
(
'font-size'
))]
lastCharFontSize
=
lastCharFontSizeList
[
0
]
if
len
(
lastCharFontSizeList
)
>
0
else
1
endX
=
float
(
endX
)
+
lastCharFontSize
*
FONTWIDTHFACTOR
elif
endSign
is
not
None
and
'%'
in
endSign
:
endX
=
float
(
endX
)
+
WIDTH
bottom
=
round
(
float
(
word_part_objs
[
0
][
'y'
]),
3
)
y
=
round
(
bottom
-
height
+
TOPCORRECTION
,
3
)
width
=
round
(
float
(
endX
)
-
x
,
3
)
transkription_positions
=
[
WordPosition
(
height
=
height
,
width
=
width
,
x
=
x
,
y
=
y
,
matrix
=
matrix
,
tag
=
WordPosition
.
TRANSKRIPTION
)
]
text
=
''
.
join
([
dict
[
'text'
]
for
dict
in
word_part_objs
])
line_number
=
page
.
get_line_number
(
(
y
+
bottom
)
/
2
)
if
page
is
not
None
else
line_number
word
=
Word
(
id
=
id
,
text
=
text
,
line_number
=
line_number
,
transkription_positions
=
transkription_positions
,
word_part_objs
=
word_part_objs
)
word
.
debug_msg
=
debug_msg
return
word
else
:
error_msg
=
'word_node has not been defined'
if
(
word_node
is
None
)
else
'word_part_objs is empty'
raise
Exception
(
'Error: {}'
.
format
(
error_msg
))
@classmethod
def get_semantic_dictionary(cls):
    """ Creates and returns a semantic dictionary as specified by SemanticClass.

    The dictionary of the superclass is extended by the properties of this
    class.  The dictionaries of the correction properties (the keys of
    XML_CORRECTION_DICT) are additionally updated with the dictionary of the
    super property 'isCorrectionOfWord'.
    """
    dictionary = super(Word, cls).get_semantic_dictionary()
    properties = dictionary[cls.PROPERTIES_KEY]
    # (property key, type of the property, keyword arguments), in the order
    # in which the properties are added to the dictionary
    property_specs = [
        ('styles', Style, dict(
            cardinality=1, cardinality_restriction='minCardinality',
            name='wordHasStyle', label='word has style',
            comment='Word has an appearance that is characterized by this style.')),
        ('corrections', Word, dict(
            name='wordHasCorrection', label='word has corrections',
            comment='Word has a correction made by the author.')),
        ('deletion_paths', WordDeletionPath, dict(
            name='wordIsDeletedByPath', label='word has been deleted with a deletion path',
            comment='Word has been deleted by the author using a deletion path.')),
        ('editor_comment', EditorComment, dict(
            name='wordHasEditorComment', label='word has a comment by the editors',
            comment='Word has been commented by the editors.')),
        ('earlier_version', Word, dict(
            name='wordHasEarlierVersion', label='word has an earlier version',
            comment='There is a earlier version of this word.')),
        ('edited_text', str, dict(
            name='hasEditedText', label='word has an edited text',
            comment='Word has a text that is edited automatically by removing deleted parts or hyphens.')),
        ('isClarificationOfWord', Word, dict(
            name='isClarificationOfWord', label='word is a clarification of word',
            comment='The author has used this part of the word in order to clarify the appearance of that word.')),
        ('isDeletionOfWord', Word, dict(
            name='isDeletionOfWord', label='word is a deletion of word',
            comment='The author has used this part of a word in order to delete the corresponding part of an earlier version of this word.')),
        ('isExtensionOfWord', Word, dict(
            name='isExtensionOfWord', label='word is a extension of word',
            comment='The author has used this part of a word in order to extend an earlier version of this word.')),
        ('isTransformationOfWord', Word, dict(
            name='isTransformationOfWord', label='word is a transformation of word',
            comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.')),
        ('overwrites_word', Word, dict(
            name='overwritesWord', label='word overwrites word',
            comment='The author has used this word in order to overwrite that word.')),
        # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING,
        # cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class.
        ('word_parts', list, dict(
            name='wordHasWordParts', label='word has word parts',
            comment='Word consists of a list of words.',
            subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING)),
    ]
    for property_key, property_type, options in property_specs:
        properties.update(cls.create_semantic_property_dictionary(property_key, property_type, **options))
    super_property_dictionary = cls.create_semantic_property_dictionary(
        cls.SUPER_PROPERTY, Word,
        name='isCorrectionOfWord', label='word is a correction of word',
        comment='The author has used this word in order to correct that word.')
    for correction_key in cls.XML_CORRECTION_DICT.keys():
        correction_dict = properties.get(correction_key)
        correction_dict.update(super_property_dictionary)
        properties.update({correction_key: correction_dict})
    return cls.return_dictionary_after_updating_super_classes(dictionary)
def has_mixed_status(self, property_key, include_parts=False, concerns_word=True):
    """Return True if property_key has more than one value.

    The values are read from the __dict__ of the objects.  False is returned
    if a transkription position of this word lacks property_key.

    Without word parts, or with include_parts=False, the transkription
    positions of this word are compared.  Otherwise the word parts are
    compared (False if one of them lacks property_key) or, with
    concerns_word=False, the first transkription positions of the word parts
    that have property_key.

    :param property_key: name of the property.
    :param include_parts: compare the word parts if there are any.
    :param concerns_word: compare the word parts themselves instead of their
        first transkription positions.
    :return: bool
    """
    if not all(property_key in position.__dict__ for position in self.transkription_positions):
        return False
    if len(self.word_parts) > 0 and include_parts:
        if concerns_word:
            if not all(property_key in word_part.__dict__ for word_part in self.word_parts):
                return False
            values = {word_part.__dict__[property_key] for word_part in self.word_parts}
        else:
            values = {
                word_part.transkription_positions[0].__dict__[property_key]
                for word_part in self.word_parts
                if len(word_part.transkription_positions) > 0
                and property_key in word_part.transkription_positions[0].__dict__
            }
    else:
        values = {position.__dict__[property_key] for position in self.transkription_positions}
    return len(values) > 1
def init_word(self, page):
    """Initialize word with objects from page.

    After the initialization by the superclass the writing processes of the
    page with the writing_process_id of this word are added.  The word parts
    are initialized and their lines and writing processes are collected
    (without duplicates), then the overwritten word and the earlier version
    are initialized.

    :param page: the page that provides the objects.
    """
    super(Word, self).init_word(page)
    if self.writing_process_id > -1:
        self.writing_processes += [ writing_process for writing_process in page.writing_processes\
                if writing_process.id == self.writing_process_id ]
    for word_part in self.word_parts:
        word_part.init_word(page)
        self.lines += word_part.lines
        self.writing_processes += word_part.writing_processes
    # remove duplicates (the order of the resulting lists is arbitrary)
    self.lines = list(set(self.lines))
    self.writing_processes = list(set(self.writing_processes))
    if self.overwrites_word is not None:
        self.overwrites_word.init_word(page)
    if self.earlier_version is None:
        return
    # an earlier version without own values gets the preceding writing
    # process and the line number of this word
    if self.earlier_version.writing_process_id == -1:
        self.earlier_version.writing_process_id = self.writing_process_id-1
    if self.earlier_version.line_number == -1:
        self.earlier_version.line_number = self.line_number
    self.earlier_version.init_word(page)
def join(self, other_word, append_at_end_of_new_word=True):
    """Join other_word with this word.

    The text of other_word is added to the text of this word and its
    transkription positions are added to the transkription positions of this
    word, at the end or at the beginning.  Appended positions get their new
    index (as a string) as id; when other_word is put in front, the positions
    of this word get their new index as id, while the ids of the inserted
    positions are kept.  Finally the transkription positions are simplified.

    :param other_word: the word that is joined with this word.
    :param append_at_end_of_new_word: append other_word (True) or put it in
        front of this word (False).
    """
    if append_at_end_of_new_word:
        self.text = self.text + other_word.text
        for appended_position in other_word.transkription_positions:
            appended_position.id = str(len(self.transkription_positions))
            self.transkription_positions.append(appended_position)
    else:
        self.text = other_word.text + self.text
        number_of_inserted = 0
        for inserted_position in other_word.transkription_positions:
            self.transkription_positions.insert(number_of_inserted, inserted_position)
            number_of_inserted += 1
        # renumber the positions that have been moved to the back
        for new_index in range(number_of_inserted, len(self.transkription_positions)):
            self.transkription_positions[new_index].id = str(new_index)
    self.simplify_transkription_positions()
def partition_according_to_deletion(self):
    """Partition a word according to its transkription_positions' deletion status
    ->split word and add partial words as its parts.

    If the deletion status of the transkription positions is mixed, a word
    part is created for every run of positions with the same status; the word
    itself loses its positions, its line number and its deleted status.
    Otherwise existing word parts are partitioned, or, if there are none, the
    word takes over the status of its first transkription position if this
    position is deleted.
    """
    if self.has_mixed_status('deleted'):
        def append_part(positions, status):
            # add a word part for one run of positions with the same status
            self.word_parts.append(Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=positions, deleted=status,\
                    writing_process_id=self.writing_process_id))

        current_run = []
        run_status = None
        for position in self.transkription_positions:
            if position.deleted != run_status and len(current_run) > 0:
                append_part(current_run, run_status)
                current_run = []
            current_run.append(position)
            run_status = position.deleted
        if len(current_run) > 0:
            append_part(current_run, run_status)
        self.transkription_positions = []
        self.line_number = -1
        self.deleted = False
    elif len(self.word_parts) > 0:
        self.word_parts, _ = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion')
    elif not self.deleted\
            and len(self.transkription_positions) > 0\
            and self.transkription_positions[0].deleted:
        self.deleted = True
def partition_according_to_writing_process_id(self):
    """Partition a word according to its transkription_positions' writing_process_ids
    ->split word and add partial words as its parts.

    If the transkription positions belong to several writing processes, a
    word part is created for every run of positions with the same
    writing_process_id and the word loses its positions; otherwise existing
    word parts are partitioned.  Afterwards the writing_process_id of the
    word is set to the highest id of its parts if these differ, else to the
    id of its first transkription position if there is one.
    """
    if self.belongs_to_multiple_writing_processes():
        def append_part(positions, process_id):
            # add a word part for one run of positions of the same writing process
            self.word_parts.append(Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=positions, writing_process_id=process_id))

        run_process_id = -1
        current_run = []
        for position in self.transkription_positions:
            if position.writing_process_id != run_process_id and len(current_run) > 0:
                append_part(current_run, run_process_id)
                current_run = []
            current_run.append(position)
            run_process_id = position.writing_process_id
        if len(current_run) > 0:
            append_part(current_run, run_process_id)
        self.transkription_positions = []
    elif len(self.word_parts) > 0:
        self.word_parts, _ = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id')
    if self.belongs_to_multiple_writing_processes(include_parts=True):
        ids_of_parts = set([ word_part.writing_process_id for word_part in self.word_parts ])
        self.writing_process_id = sorted(ids_of_parts, reverse=True)[0]
    elif len(self.transkription_positions) > 0:
        self.writing_process_id = self.transkription_positions[0].writing_process_id
def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False):
    """Determines whether word is over a word box.

    A word with word parts delegates to its parts.  Otherwise a path is
    created for every transkription position and compared with box_paths;
    the first box that partially contains or intersects the path is assigned
    to the position and removed from box_paths.

    NOTE: box_paths is modified in place, and the transkription positions of
    the word may be replaced and are renumbered.

    :param box_paths: the box paths that have not been assigned yet.
    :param tr_xmin: passed on to Path.create_path_from_transkription_position.
    :param tr_ymin: passed on to Path.create_path_from_transkription_position.
    :param previous_word_has_box: whether a box has been found for the
        preceding word (part).
    :return: the word (part) that is over a box; for a word with parts the
        last part for which a box has been found, or None.
    """
    word_over_box = None
    if len(self.word_parts) > 0:
        for word in self.word_parts:
            # Each part is told whether a box has been found for one of the parts before it.
            current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None))
            if current_word is not None and current_word.word_box is not None:
                word_over_box = current_word
    else:
        # filled by _set_box_to_transkription_position (not visible here);
        # read below as: position to replace -> list of replacing positions
        new_tp_dict = {}
        for index, transkription_position in enumerate(self.transkription_positions):
            if previous_word_has_box and index == 0:
                # The left value of the first position is increased before the
                # comparison -- presumably to keep it out of the box of the
                # preceding word; TODO confirm.  This changes the position permanently.
                if len(transkription_position.positional_word_parts) > 0:
                    transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2
                    #print(f'{self.text}: {transkription_position.positional_word_parts[0].left}')
                else:
                    transkription_position.left += 1
            word_path = Path.create_path_from_transkription_position(transkription_position,\
                    tr_xmin=tr_xmin, tr_ymin=tr_ymin)
            containing_boxes = [ box_path for box_path in box_paths\
                    if word_path.is_partially_contained_by(box_path)\
                    or box_path.do_paths_intersect(word_path) ]
            if len(containing_boxes) > 0:
                if previous_word_has_box:
                    # NOTE(review): this writes to stdout; looks like leftover debug output -- confirm.
                    print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}')
                self._set_box_to_transkription_position(containing_boxes[0], word_path,\
                        transkription_position, new_tp_dict, tr_xmin)
                # a box is assigned only once
                box_paths.remove(containing_boxes[0])
        # Replace each position that is a key of new_tp_dict by its new
        # positions; inserting before the old position keeps their order.
        for replace_tp in new_tp_dict.keys():
            for tp in new_tp_dict.get(replace_tp):
                self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp)
            self.transkription_positions.remove(replace_tp)
        word_over_box = self._get_partial_word_over_box()
        update_transkription_position_ids(self)
    return word_over_box
def set_word_insertion_mark(self, word_insertion_mark):
    """Attach an insertion mark to this word.

    :param word_insertion_mark: the object stored as self.word_insertion_mark.
    """
    setattr(self, 'word_insertion_mark', word_insertion_mark)
def set_writing_process_id_to_transkription_positions(self, page):
    """Derive the writing process id of each transkription_position from its font key.

    The style classes of the first positional word part are looked up in
    page.fontsizekey2stage_mapping. If several classes match, the last
    match wins. Positions without positional word parts are left untouched.

    :param page: object providing the mapping fontsizekey2stage_mapping.
    """
    for position in self.transkription_positions:
        if len(position.positional_word_parts) == 0:
            continue
        first_part = position.positional_word_parts[0]
        for style_key in first_part.style_class.split(' '):
            if style_key not in page.fontsizekey2stage_mapping.keys():
                continue
            position.writing_process_id = page.fontsizekey2stage_mapping.get(style_key)
def simplify_transkription_positions(self):
    """Merge transkription_positions if possible.

    Walks the list of transkription_positions from the end towards the start
    and tries to merge each position into its predecessor. A merge is applied
    only if the predecessor reports is_mergebale_with(current) and the
    positional word parts of both yield exactly one new transkription
    position. The merged position takes the writing_process_id of the
    predecessor unless that is -1, in which case the id of the current
    position is used.

    Nothing is done while any transkription_position lacks the attribute
    'positional_word_parts'.
    """
    index = len(self.transkription_positions)-1
    while index > 0\
    and all('positional_word_parts' in tp.__dict__ for tp in self.transkription_positions):
        current_tp = self.transkription_positions[index]
        index -= 1
        previous_tp = self.transkription_positions[index]
        if previous_tp.is_mergebale_with(current_tp):
            # Build a NEW list: extending previous_tp.positional_word_parts in place
            # would leave previous_tp with the parts of current_tp as duplicates
            # whenever the merge below is rejected.
            positional_word_parts = list(previous_tp.positional_word_parts)\
                    + list(current_tp.positional_word_parts)
            transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
                    positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id)
            if len(transkription_positions) == 1:
                transkription_positions[0].writing_process_id = previous_tp.writing_process_id\
                        if previous_tp.writing_process_id != -1\
                        else current_tp.writing_process_id
                # index already points at previous_tp, so current_tp sits at index+1.
                self.transkription_positions.pop(index+1)
                self.transkription_positions[index] = transkription_positions[0]
def split(self, split_string, start_id=0):
    """Splits the word and returns an 3-tuple of new words.

    self.text is partitioned with str.partition(split_string) into the text
    before the separator, the separator itself and the text after it. The
    positional word parts of all transkription_positions are then distributed
    to up to three new words according to these three strings.

    :param split_string: the string at which the word is split.
    :param start_id: id of the first word that is created; later words get
        consecutive ids.
    :return: (previousWord, currentWord, nextWord). previousWord and nextWord
        are None if the corresponding string is empty or if it does not match
        the text of the positional word parts (a warning is issued in the
        latter case). currentWord is always created.
    """
    previousString, currentString, nextString = self.text.partition(split_string)
    currentWord = None
    previousWord = None
    nextWord = None
    previousIndex = 0
    current_id = start_id
    # Flatten the positional word parts of all transkription_positions into one list.
    all_positional_word_parts = []
    for position in self.transkription_positions:
        all_positional_word_parts += position.positional_word_parts
    if len(all_positional_word_parts) == 0:
        warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString))
    if len(previousString) > 0:
        # Take parts from the front until their joined text equals previousString.
        previous_pwps = []
        while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
            previous_pwps.append(all_positional_word_parts[previousIndex])
            previousIndex += 1
        if previousString != ''.join([ pwp.text for pwp in previous_pwps ]):
            warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString))
        else:
            previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split')
            previous_text = ''.join([ pwp.text for pwp in previous_pwps ])
            previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions)
            current_id += 1
            # The parts used for previousWord are no longer available for the other words.
            all_positional_word_parts = all_positional_word_parts[previousIndex:]
    if len(nextString) > 0:
        # Find the parts that make up currentString; everything after them belongs to nextWord.
        tmp_pwps = []
        index = 0
        while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
            tmp_pwps.append(all_positional_word_parts[index])
            index += 1
        if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]):
            warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString))
        else:
            next_pwps = all_positional_word_parts[index:]
            next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split')
            next_text = ''.join([ pwp.text for pwp in next_pwps ])
            # current_id is reserved for currentWord, which is created below.
            nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions)
            all_positional_word_parts = all_positional_word_parts[:index]
    # Whatever has not been assigned to previousWord or nextWord forms currentWord.
    current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split')
    current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ])
    currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions)
    return previousWord, currentWord, nextWord
def split_according_to_status(self, status, splits_are_parts=False):
    """Split a word into runs of transkription_positions that share the same status.

    :param status: name of the attribute of the transkription_positions
        that is compared.
    :param splits_are_parts: if True, the new words are appended to
        self.word_parts and, if the word then has parts, its own
        transkription_positions are emptied.
    :return: a list of new word.Word (empty if the status is not mixed)
    """
    pieces = []

    def emit(run):
        # ids continue from self.id in the order in which the pieces are created
        pieces.append(self._create_new_word(run, status, new_id=self.id + len(pieces)))

    if self.has_mixed_status(status):
        run = []
        run_status = None
        for position in self.transkription_positions:
            position_status = vars(position)[status]
            status_changed = position_status != run_status
            if status_changed and len(run) > 0:
                emit(run)
                run = []
            run.append(position)
            run_status = position_status
        if len(run) > 0:
            emit(run)
    if not splits_are_parts:
        return pieces
    self.word_parts += pieces
    if len(self.word_parts) > 0:
        self.transkription_positions = []
    return pieces
def undo_partitioning(self):
    """Revert the partitioning of this word into word_parts.

    Every part is reverted first. Its transkription_positions are moved
    back to this word as long as the text of the transkription_positions
    collected so far differs from self.text. Afterwards all properties
    that result from partitioning are reset.
    """
    for part in self.word_parts:
        part.undo_partitioning()
        collected_text = ''.join(tp.get_text() for tp in self.transkription_positions)
        if self.text != collected_text:
            self.transkription_positions += part.transkription_positions
    self.earlier_version = self.edited_text = self.word_box = None
    self.word_parts = []
    self.corrections = []
    self.earlier_versions = []
    self.box_paths = []
def _create_new_word(self, transkription_positions, status, new_id=0):
    """Build a new word for transkription_positions that inherits properties from self.

    All properties listed in self.COPY_PROPERTY_KEY, except status, are
    copied from self if self has them. The value of status is taken from
    the first transkription_position: it is appended to a list property of
    the new word if status is a key of
    self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS, otherwise it is set
    directly on the new word.

    :param transkription_positions: non-empty list of transkription positions.
    :param status: name of the property that distinguishes the new word.
    :param new_id: id of the new word.
    :return: the new word.Word
    """
    fresh_word = Word(id=new_id, transkription_positions=transkription_positions)
    own_properties = vars(self)
    fresh_properties = vars(fresh_word)
    for copy_key in self.COPY_PROPERTY_KEY:
        if copy_key == status or copy_key not in own_properties:
            continue
        fresh_properties[copy_key] = own_properties[copy_key]
    if status not in self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS:
        fresh_properties[status] = vars(transkription_positions[0])[status]
        return fresh_word
    target_key = self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS[status]
    fresh_properties[target_key].append(vars(transkription_positions[0])[status])
    return fresh_word
def
_get_parts_with_property_key
(
self
,
property_key
):
"""Return a list of word_parts with property == property_key.
"""
word_parts
=
[]
for
word_part
in
self
.
word_parts
:
if
property_key
in
word_part
.
__dict__
.
keys
():
word_parts
.
append
(
word_part
)
else
:
word_parts
+=
word_part
.
_get_parts_with_property_key
(
property_key
)
return
word_parts
def _get_partial_word_over_box(self):
    """Partition a word according to its transkription_positions' has_box
        ->split word and add partial words as its parts.

        Three cases are distinguished:
        - has_box is mixed: consecutive transkription_positions with the same
          has_box become a new word part; self.transkription_positions is
          emptied afterwards. If several runs have a box, the last one is returned.
        - the word already has parts: the parts are asked in order until one
          of them returns a word over a box.
        - exactly one transkription_position has a box: the word itself is
          over the box.

        :return: word over box (a part or self) or None
    """
    word_over_box = None
    if self.has_mixed_status('has_box'):
        transkription_positions = []
        last_word_box = None
        for transkription_position in self.transkription_positions:
            # A change of has_box closes the current run and turns it into a word part.
            if transkription_position.has_box != last_word_box\
            and len(transkription_positions) > 0:
                newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                        transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
                self.word_parts.append(newWord)
                if last_word_box is not None:
                    word_over_box = newWord
                    word_over_box.word_box = last_word_box
                transkription_positions = []
            transkription_positions.append(transkription_position)
            last_word_box = transkription_position.has_box
        # Flush the last run.
        if len(transkription_positions) > 0:
            newWord = Word(id=len(self.word_parts), line_number=self.line_number,\
                    transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id)
            self.word_parts.append(newWord)
            if last_word_box is not None:
                word_over_box = newWord
                word_over_box.word_box = last_word_box
        # The positions now belong to the parts.
        self.transkription_positions = []
    elif len(self.word_parts) > 0:
        # Stop at the first part that yields a word over a box.
        for word_part in self.word_parts:
            if word_over_box is None:
                word_over_box = word_part._get_partial_word_over_box()
            else:
                break
    elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1:
        word_over_box = self
        word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box
    return word_over_box
def
_set_box_to_transkription_position
(
self
,
box_path
,
word_path
,
transkription_position
,
new_transkription_positions_dictionary
,
tr_xmin
):
"""Set box_path to transkription_position that is contained by box_path.
Create new transkription_positions by splitting old ones if necessaryand add them to new_transkription_positions_dictionary.
"""
if
box_path
.
contains_path
(
word_path
):
transkription_position
.
has_box
=
box_path
elif
box_path
.
contains_start_of_path
(
word_path
):
split_position
=
box_path
.
path
.
bbox
()[
1
]
-
tr_xmin
new_tps
=
transkription_position
.
split
(
split_position
)
if
len
(
new_tps
)
==
2
:
new_tps
[
0
]
.
has_box
=
box_path
new_transkription_positions_dictionary
.
update
({
transkription_position
:
new_tps
})
else
:
transkription_position
.
has_box
=
box_path
elif
box_path
.
contains_end_of_path
(
word_path
):
split_position
=
box_path
.
path
.
bbox
()[
0
]
-
tr_xmin
new_tps
=
transkription_position
.
split
(
split_position
)
if
len
(
new_tps
)
==
2
:
new_tps
[
1
]
.
has_box
=
box_path
new_transkription_positions_dictionary
.
update
({
transkription_position
:
new_tps
})
else
:
transkription_position
.
has_box
=
box_path
else
:
# box_path in the middle of word_pathz
split_position1
=
box_path
.
path
.
bbox
()[
0
]
-
tr_xmin
split_position2
=
box_path
.
path
.
bbox
()[
1
]
-
tr_xmin
new_tps
=
transkription_position
.
split
(
split_position1
,
split_position2
)
if
len
(
new_tps
)
>=
2
:
new_tps
[
1
]
.
has_box
=
box_path
new_transkription_positions_dictionary
.
update
({
transkription_position
:
new_tps
})
else
:
transkription_position
.
has_box
=
box_path
def do_paths_intersect_saveMode(mypath1, mypath2):
    """Check whether two paths intersect without propagating an AssertionError.

    :return: the (truthy) result of mypath1.path.intersect if the paths
        intersect, else the result of mypath1.is_partially_contained_by(mypath2);
        False if an AssertionError was raised.
    """
    try:
        intersections = mypath1.path.intersect(mypath2.path, justonemode=True)
        if intersections:
            return intersections
        return mypath1.is_partially_contained_by(mypath2)
    except AssertionError:
        return False
Event Timeline
Log In to Comment