Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F64796670
positional_word_part.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, May 29, 13:02
Size
8 KB
Mime Type
text/x-python
Expires
Fri, May 31, 13:02 (2 d)
Engine
blob
Format
Raw Data
Handle
17960101
Attached To
rNIETZSCHEPYTHON nietzsche-python
positional_word_part.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a positional word part, i.e. a part of a word that has a position on the transkription.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata (authorship, licence and release information).
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = "University of Basel"
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from
lxml
import
etree
as
ET
from
svgpathtools.parser
import
parse_path
import
warnings
from
.positional_object
import
PositionalObject
class PositionalWordPart(PositionalObject):
    """
    This class represents a positional word part, i.e. a part of a word that has a position on the transkription.

    Args:
        node: optional XML node; if it is given, text, symbol id and style class
              are read from its attributes 'text', 'symbol-id' and 'style-class'
              and override the corresponding keyword arguments.
        id (int): object id
        text (str): text
        symbol_id (str): id of corresponding symbol
        style_class (str): style class id
        matrix (datatypes.Matrix): matrix containing information about conversion.
        height (float): height of object
        width (float): width of object
        x (float): x position of object
        y (float): y position of object
    """
    XML_TAG = 'word-part'
    # string valued keys that this class adds to the keys of its base class
    extraStringKeys = ['text', 'symbol_id', 'style_class']

    def __init__(self, node=None, id=0, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None,
                 text=None, symbol_id=None, style_class=None):
        super().__init__(id=id, node=node, height=height, width=width, x=x, y=y,
                         matrix=matrix, tag=PositionalWordPart.XML_TAG)
        # use the class level list instead of repeating the literal
        self.stringKeys += PositionalWordPart.extraStringKeys
        self.text = text
        self.symbol_id = symbol_id
        self.style_class = style_class
        if node is not None:
            # values stored on the node take precedence over the keyword arguments
            self.text = node.get('text')
            self.symbol_id = node.get('symbol-id')
            self.style_class = node.get('style-class')

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.

        [:return:] the dictionary of the base class, with one (str, 1) entry per
                   key of extraStringKeys added to dictionary['properties']
        """
        # zero-argument super() resolves relative to PositionalWordPart, not to cls:
        # super(cls, cls) would recurse endlessly if called on a subclass.
        dictionary = super().get_semantic_dictionary()
        dictionary['properties'].update({key: (str, 1) for key in cls.extraStringKeys})
        return dictionary

    @staticmethod
    def CREATE_POSITIONAL_WORD_PART(text, use_node, namespaces, start_id=0, xmin=0.0, ymin=0.0,
                                    matrix=None, style_class=None):
        """Creates a PositionalWordPart from a svg use node.

        The symbol id is taken from the xlink:href attribute of use_node (without '#').
        If a path of the corresponding symbol is found, width and height are derived
        from the bounding box of that path; otherwise they keep their default values.

        Args:
            text (str): text of the word part
            use_node: svg use node (must provide get and xpath)
            namespaces (dict): namespaces containing the keys 'xlink' and 'ns'
            start_id (int): id of the new word part
            xmin (float): offset that is subtracted from the x attribute of use_node
            ymin (float): offset that is subtracted from the y attribute of use_node
            matrix: matrix that is passed on to the new word part
            style_class (str): style class id

        [:return:] a PositionalWordPart
        """
        symbol_id = use_node.get('{%s}href' % namespaces['xlink']).replace('#', '')
        # a missing (or empty) coordinate yields 0.0, not the negative offset
        x = float(use_node.get('x')) - xmin if bool(use_node.get('x')) else 0.0
        y = float(use_node.get('y')) - ymin if bool(use_node.get('y')) else 0.0
        d_strings = use_node.xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
        if not d_strings:
            return PositionalWordPart(id=start_id, text=text, x=x, y=y, matrix=matrix,
                                      symbol_id=symbol_id, style_class=style_class)
        # distinct names: do not shadow the xmin/ymin offsets passed by the caller
        path_xmin, path_xmax, path_ymin, path_ymax = parse_path(d_strings[0]).bbox()
        width = path_xmax - path_xmin
        height = path_ymax - path_ymin
        # NOTE(review): presumably y of the use node is the lower edge of the glyph,
        # hence the height is subtracted in order to get its upper edge -- confirm.
        return PositionalWordPart(id=start_id, text=text, height=height, width=width, x=x, y=y - height,
                                  matrix=matrix, symbol_id=symbol_id, style_class=style_class)

    @staticmethod
    def CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_path_tree, namespaces, page, start_id=0, xmin=0.0, ymin=0.0):
        """Creates a list of PositionalWordPart from a word_part_obj (a dictionary with the keys: text, x, y, matrix, class),
        using a (lxml.ElementTree) svg_path_tree and the corresponding namespaces.

        One PositionalWordPart is created per character of the text: the first use node
        is searched near the position of word_part_obj, the following characters are
        mapped to the following siblings of that node. If the siblings run out before
        the text does, the search is repeated for the remaining text to the right of the
        last word part. If no use node is found at all, the list is created by
        CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST. word_part_obj itself is not modified.

        Args:
            word_part_obj (dict): dictionary with the keys text, x, y, matrix, class
            svg_path_tree: tree that is searched for use nodes
            namespaces (dict): namespaces containing the keys 'xlink' and 'ns'
            page: page, used by the fallback in order to get font sizes
            start_id (int): id of the first word part of the list
            xmin (float): x offset between word_part_obj coordinates and svg coordinates
            ymin (float): y offset between word_part_obj coordinates and svg coordinates

        [:return:] a list of PositionalWordPart
        """
        THRESHOLD = 0.4  # tolerance of the position search in both directions
        GAP = 0.5  # distance between last word part and the position of the next search
        x = float(word_part_obj['x']) if bool(word_part_obj.get('x')) else 0.0
        y = float(word_part_obj['y']) if bool(word_part_obj.get('y')) else 0.0
        text = word_part_obj.get('text')
        matrix = word_part_obj.get('matrix')
        style_class = word_part_obj.get('class')
        if text is None or text == '':
            return []
        svg_x = x + xmin
        svg_y = y + ymin
        use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'
                                        .format(svg_x - THRESHOLD, svg_x + THRESHOLD, svg_y - THRESHOLD, svg_y + THRESHOLD),
                                        namespaces=namespaces)
        if not use_nodes:
            warnings.warn('No use_node found for text {} svg_x {}, svg_y {}'.format(text, svg_x, svg_y))
            # pass start_id on, so that the ids continue instead of restarting at 0
            return PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, [word_part_obj], start_id=start_id)
        word_part_list = []
        index = 0
        current_use_node = use_nodes[0]
        # one character per node: walk along the siblings of the first matching node
        while current_use_node is not None and index < len(text):
            word_part_list.append(PositionalWordPart.CREATE_POSITIONAL_WORD_PART(
                text[index], current_use_node, namespaces, start_id=start_id + index,
                xmin=xmin, ymin=ymin, matrix=matrix, style_class=style_class))
            index += 1
            current_use_node = current_use_node.getnext()
        if index < len(text):
            # no sibling left, but text remaining: search again right of the last word part.
            # A copy is used in order to leave the dictionary of the caller untouched.
            last_pwp = word_part_list[-1]
            remaining_obj = dict(word_part_obj)
            remaining_obj['x'] = last_pwp.left + last_pwp.width + GAP
            remaining_obj['y'] = last_pwp.bottom
            remaining_obj['text'] = text[index:]
            word_part_list += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(
                remaining_obj, svg_path_tree, namespaces, page,
                start_id=start_id + index, xmin=xmin, ymin=ymin)
        return word_part_list

    @staticmethod
    def CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs, start_id=0):
        """Creates a list of PositionalWordPart from word_part_objs (i.e. a list of dictionaries
        with the keys: text, x, y, matrix, class).

        Height and width are estimated from the biggest font size of the style classes
        of each part; if the following part has a x position, the width is the distance
        to that position minus a spacing.

        Args:
            page: page providing get_biggest_fontSize4styles(style_set=...)
            word_part_objs (list): list of dictionaries with the keys text, x, y, matrix, class
            start_id (int): id of the first word part of the list (default 0)

        [:return:] a list of (datatypes.positional_word_part) PositionalWordPart
        """
        positional_word_parts = []
        HEIGHT_FACTOR = 1.1  # factor that multiplies font_size -> height
        FONTWIDTHFACTOR = 0.7  # factor that multiplies lastCharFontSize
        SPACING = 0.2
        for index, part_obj in enumerate(word_part_objs):
            text = part_obj.get('text')
            matrix = part_obj.get('matrix')
            style_class = part_obj.get('class')
            x = float(part_obj['x']) if bool(part_obj.get('x')) else 0.0
            y = float(part_obj['y']) if bool(part_obj.get('y')) else 0.0
            # a part without 'class' has no styles: do not call split on None
            style_set = set(style_class.split(' ')) if style_class is not None else set()
            font_size = page.get_biggest_fontSize4styles(style_set=style_set)
            # a font size of 0 must not raise a ZeroDivisionError
            height = round(font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / font_size, 3) if font_size else 0.0
            width = round(font_size * FONTWIDTHFACTOR, 3)
            if index + 1 < len(word_part_objs) and bool(word_part_objs[index + 1].get('x')):
                width = float(word_part_objs[index + 1]['x']) - x - SPACING
            positional_word_parts.append(PositionalWordPart(id=start_id + index, text=text, height=height, width=width,
                                                            x=x, y=y, matrix=matrix, style_class=style_class))
        return positional_word_parts
Event Timeline
Log In to Comment