Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F62826909
word.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, May 15, 22:34
Size
11 KB
Mime Type
text/x-python
Expires
Fri, May 17, 22:34 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
17694804
Attached To
rNIETZSCHEPYTHON nietzsche-python
word.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a word.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata (one assignment per line).
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from
lxml
import
etree
as
ET
import
warnings
from
.class_spec
import
SemanticClass
from
.matrix
import
Matrix
from
.word_position
import
WordPosition
from
.transkription_position
import
TranskriptionPosition
class Word(SemanticClass):
    """
    This class represents a word.

    A word carries its text, an id, a line number and lists of
    transkription/faksimile positions. It can be attached to an lxml tree,
    split into up to three words, joined with another word, and created
    from an xml node or from word part objects (see CREATE_WORD).
    """
    # Tag name of the node that holds the word parts (read by CREATE_WORD).
    DATA = 'debug-data'

    def __init__(self, id=0, text='', line_number=-1, transkription_positions=None,
                 faksimile_positions=None, word_part_objs=None):
        """Initializes a word.

        The list arguments default to None instead of [] so that every word
        gets its own fresh list: join() mutates self.transkription_positions
        in place, which would otherwise leak into every word that was created
        with the shared default list. Lists that are passed in explicitly are
        stored by reference (not copied).
        """
        self.id = id
        self.text = text
        self.line_number = line_number
        self.transkription_positions = transkription_positions \
            if transkription_positions is not None else []
        self.faksimile_positions = faksimile_positions \
            if faksimile_positions is not None else []
        self.word_part_objs = word_part_objs \
            if word_part_objs is not None else []
        self.is_head_of_inserted_words = False
        self.is_tail_of_inserted_words = False
        self.is_before_inserted_words = False
        self.is_after_inserted_words = False
        self.word_insertion_mark = None
        self.debug_msg = None

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates and returns a semantic dictionary as specified by SemanticClass.

        [:return:] dict with the keys 'class' (result of cls.get_class_dictionary())
            and 'properties' (property name -> (type, cardinality, xpath)).
        """
        properties = {
            'id': (int, 1, 'word/@id'),
            'text': (str, 1, 'word/@text'),
            'line_number': (int, 1, 'word/@line-number'),
            'transkription_positions': (TranskriptionPosition, SemanticClass.LIST, 'word/@id'),
            'faksimile_positions': (WordPosition, SemanticClass.LIST, 'word/@id'),
        }
        return {'class': cls.get_class_dictionary(), 'properties': properties}

    def set_word_insertion_mark(self, word_insertion_mark):
        """Sets word_insertion_mark
        """
        self.word_insertion_mark = word_insertion_mark

    def attach_word_to_tree(self, target_tree):
        """Attaches word to tree target_tree.

        Reuses the first existing <word> node with this word's id, or creates
        a new one below the root. Sets the 'text' attribute, sets
        'line-number' only if line_number > -1, and attaches every
        transkription position to the word node.
        """
        root = target_tree.getroot()
        # Query once; the result decides between reuse and creation.
        existing_nodes = root.xpath('//word[@id="%s"]' % self.id)
        if len(existing_nodes) > 0:
            word_node = existing_nodes[0]
        else:
            word_node = ET.SubElement(root, 'word', attrib={'id': str(self.id)})
        word_node.set('text', self.text)
        if self.line_number > -1:
            word_node.set('line-number', str(self.line_number))
        for transkription_position in self.transkription_positions:
            transkription_position.attach_object_to_tree(word_node)
        # NOTE: writing word_part_objs/debug_msg to a DATA node is disabled.
        # The disabled code is kept for reference, CREATE_WORD still reads
        # './/' + DATA + '/part' nodes:
        #
        # data_node = word_node.find(self.DATA) if bool(word_node.find(self.DATA)) else ET.SubElement(word_node, self.DATA)
        # for part_index, word_part in enumerate(self.word_part_objs):
        #     part_node = data_node.xpath('./part[@index="%s"]' % part_index)[0] \
        #             if(len(data_node.xpath('./part[@index="%s"]' % part_index)) > 0) \
        #             else ET.SubElement(data_node, 'part', attrib={'index': str(part_index)})
        #     part_node.set('text', word_part['text'])
        #     part_node.set('class', word_part['class'])
        #     part_node.set('x', str(round(float(word_part['x']), 3)))
        #     part_node.set('y', str(round(float(word_part['y']), 3)))
        # if self.debug_msg is not None:
        #     ET.SubElement(data_node, 'end', attrib={'debug-msg': self.debug_msg})

    @staticmethod
    def _collect_matching_prefix(positional_word_parts, target_text):
        """Collects leading parts until their concatenated text equals target_text.

        [:return:] (collected parts, True if the concatenated text equals target_text)
        """
        collected_parts = []
        collected_text = ''
        for part in positional_word_parts:
            if collected_text == target_text:
                break
            collected_parts.append(part)
            collected_text += part.text
        return collected_parts, collected_text == target_text

    def split(self, page, split_string, start_id=0):
        """Splits the word and returns an 3-tuple of new words.

        The text is partitioned at the first occurrence of split_string. The
        positional word parts of all transkription positions are distributed
        to the new words accordingly; ids are assigned consecutively starting
        with start_id.

        [:return:] (previousWord, currentWord, nextWord) -- previousWord and
            nextWord are None if the corresponding string is empty or if it
            does not match the positional word parts (a warning is issued in
            the latter case).
        """
        previous_string, current_string, next_string = self.text.partition(split_string)
        previous_word = None
        next_word = None
        current_id = start_id
        remaining_parts = []
        for position in self.transkription_positions:
            remaining_parts += position.positional_word_parts
        if len(remaining_parts) == 0:
            warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(
                self.id, self.text, previous_string, current_string, next_string))
        if len(previous_string) > 0:
            previous_pwps, matches = self._collect_matching_prefix(remaining_parts, previous_string)
            if not matches:
                warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previous_string))
            else:
                previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(
                    page, previous_pwps, debug_msg_string='word.split')
                previous_text = ''.join([pwp.text for pwp in previous_pwps])
                previous_word = Word(text=previous_text, id=current_id, line_number=self.line_number,
                                     transkription_positions=previous_transkription_positions)
                current_id += 1
                # Everything after the previous word belongs to current/next.
                remaining_parts = remaining_parts[len(previous_pwps):]
        if len(next_string) > 0:
            current_pwps, matches = self._collect_matching_prefix(remaining_parts, current_string)
            if not matches:
                warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(current_string))
            else:
                next_pwps = remaining_parts[len(current_pwps):]
                next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(
                    page, next_pwps, debug_msg_string='word.split')
                next_text = ''.join([pwp.text for pwp in next_pwps])
                # The current word gets current_id below, hence + 1 here.
                next_word = Word(text=next_text, id=current_id + 1, line_number=self.line_number,
                                 transkription_positions=next_transkription_positions)
                remaining_parts = current_pwps
        current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(
            page, remaining_parts, debug_msg_string='word.split')
        current_text = ''.join([pwp.text for pwp in remaining_parts])
        current_word = Word(text=current_text, id=current_id, line_number=self.line_number,
                            transkription_positions=current_transkription_positions)
        return previous_word, current_word, next_word

    def join(self, other_word, append_at_end_of_new_word=True):
        """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions.

        If append_at_end_of_new_word is True, the text and positions of
        other_word are appended and each appended position gets its new list
        index as id. Otherwise they are prepended; the prepended positions
        keep their ids and the positions that follow them are renumbered
        according to their new list index. Ids are set as strings.
        """
        if append_at_end_of_new_word:
            self.text = self.text + other_word.text
            for position in other_word.transkription_positions:
                position.id = str(len(self.transkription_positions))
                self.transkription_positions.append(position)
        else:
            self.text = other_word.text + self.text
            inserted = 0
            for position in other_word.transkription_positions:
                self.transkription_positions.insert(inserted, position)
                inserted += 1
            # Renumber only the positions that were shifted by the insertion.
            for index in range(inserted, len(self.transkription_positions)):
                self.transkription_positions[index].id = str(index)

    @staticmethod
    def CREATE_WORD(word_node=None, page=None, word_part_objs=None, id=0, height=0, endX=0,
                    endSign=None, matrix=None, line_number=-1, debug_msg=None):
        """Creates a word from a (lxml.Element) node or word_part_objs.

        If word_node is given, id, text, line number, positions and word
        parts are read from the node. Otherwise the word is built from
        word_part_objs (a sequence of mappings with the keys 'x', 'y',
        'class' and 'text'), using page (if given) for font sizes and for
        the line number.

        [:return:] Word
        [:raise:] Exception if word_node is None and word_part_objs is empty.
        """
        if word_node is not None:
            # init word from xml node
            id = int(word_node.get('id'))
            if bool(word_node.get('line-number')):
                line_number = int(word_node.get('line-number'))
            text = word_node.get('text')
            transkription_positions = [TranskriptionPosition(node=node)
                                       for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION)]
            faksimile_positions = [WordPosition(node=node)
                                   for node in word_node.findall('.//' + WordPosition.FAKSIMILE)]
            # Prefer parts below a DATA node; fall back to any part node.
            if len(word_node.findall('.//' + Word.DATA)) > 0:
                part_nodes = word_node.findall('.//' + Word.DATA + '/part')
            else:
                part_nodes = word_node.findall('.//part')
            word_part_objs = [item.attrib for item in part_nodes]
            return Word(id=id, text=text, line_number=line_number,
                        transkription_positions=transkription_positions,
                        faksimile_positions=faksimile_positions, word_part_objs=word_part_objs)
        elif word_part_objs is not None and len(word_part_objs) > 0:
            # init word from word_part_obj that has been extracted from svg file
            WIDTH = 5
            TOPCORRECTION = 2.0
            FONTWIDTHFACTOR = 0.7  # factor that multiplies lastCharFontSize
            has_end_sign = endSign is not None and '%' in endSign
            x = round(float(word_part_objs[0]['x']), 3)
            if page is not None and bool(page.style_dict):
                HEIGHT_FACTOR = 1.1  # factor that multiplies biggest_font_size -> height
                style_set = set(' '.join(set(part['class'] for part in word_part_objs)).split(' '))
                biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set)
                height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3)
                TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size
                if has_end_sign:
                    # Widen the word by the font size of the last part's first
                    # style class that defines a font-size (1 if none does).
                    lastCharFontSizeList = [float(page.style_dict[key]['font-size'].replace('px', ''))
                                            for key in word_part_objs[-1]['class'].split(' ')
                                            if bool(page.style_dict[key].get('font-size'))]
                    lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1
                    endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR
            elif has_end_sign:
                endX = float(endX) + WIDTH
            bottom = round(float(word_part_objs[0]['y']), 3)
            y = round(bottom - height + TOPCORRECTION, 3)
            width = round(float(endX) - x, 3)
            transkription_positions = [WordPosition(height=height, width=width, x=x, y=y, matrix=matrix,
                                                    tag=WordPosition.TRANSKRIPTION)]
            text = ''.join([part['text'] for part in word_part_objs])
            if page is not None:
                line_number = page.get_line_number((y + bottom) / 2)
            word = Word(id=id, text=text, line_number=line_number,
                        transkription_positions=transkription_positions, word_part_objs=word_part_objs)
            word.debug_msg = debug_msg
            return word
        else:
            # word_node is always None when this branch is reached, so this is
            # the only message that could ever be emitted here.
            error_msg = 'word_node has not been defined'
            raise Exception('Error: {}'.format(error_msg))
Event Timeline
Log In to Comment