Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F61600618
fix_missing_glyphs.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, May 7, 18:06
Size
8 KB
Mime Type
text/x-python
Expires
Thu, May 9, 18:06 (2 d)
Engine
blob
Format
Raw Data
Handle
17533247
Attached To
rNIETZSCHEPYTHON nietzsche-python
fix_missing_glyphs.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to fix missing glyphs.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from
colorama
import
Fore
,
Style
import
getopt
import
re
import
sys
from
os
import
listdir
,
sep
,
path
from
os.path
import
isfile
,
isdir
,
dirname
import
lxml.etree
as
ET
if
dirname
(
__file__
)
not
in
sys
.
path
:
sys
.
path
.
append
(
dirname
(
__file__
))
from
datatypes.page
import
Page
,
FILE_TYPE_SVG_WORD_POSITION
,
FILE_TYPE_XML_MANUSCRIPT
from
datatypes.positional_word_part
import
PositionalWordPart
from
datatypes.transkriptionField
import
TranskriptionField
from
datatypes.transkription_position
import
TranskriptionPosition
from
process_files
import
update_svgposfile_status
sys
.
path
.
append
(
'shared_util'
)
from
myxmlwriter
import
write_pretty
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

# While this flag is True, fix_missing_glyphs prints no progress output.
UNITTESTING = False
def find_missing_glyph_for_pwp(positional_word_part_node, svg_path_tree, namespaces, xmin=0.0, ymin=0.0):
    """Finds missing glyph for node of a PositionalWordPart.

    The lookup is attempted repeatedly: the threshold starts at -0.5 and is
    increased by 0.1 after every attempt that yields no part, until at least
    one part has been found or the threshold is no longer below 15.5.

    :param positional_word_part_node: node from which the PositionalWordPart is created
    :param svg_path_tree: svg tree that is handed on to the lookup
    :param namespaces: namespace mapping that is handed on to the lookup
    :param xmin: handed on to the lookup unchanged
    :param ymin: handed on to the lookup unchanged
    :return: list of PositionalWordPart (empty if no attempt succeeded)
    """
    THRESHOLD = 15.5
    pwp = PositionalWordPart(node=positional_word_part_node)
    word_part_obj = {
        "x": pwp.left,
        "y": pwp.top,
        "text": pwp.text,
        "matrix": pwp.transform,
        "class": pwp.style_class,
    }
    start_id = int(pwp.id)
    threshold = -0.5
    positional_word_parts = []
    while threshold < THRESHOLD and len(positional_word_parts) < 1:
        try:
            positional_word_parts = PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(
                word_part_obj, svg_path_tree, namespaces,
                start_id=start_id, xmin=xmin, ymin=ymin,
                threshold=threshold, throw_error_if_not_found=True)
        except Exception:
            # Deliberate best effort: a failed lookup counts as "nothing found"
            # and is retried below with a wider threshold.
            positional_word_parts = []
        if len(positional_word_parts) < 1:
            # Widen the threshold after EVERY unsuccessful attempt, not only
            # after an exception. Otherwise an empty result that is returned
            # without raising would repeat the identical call forever.
            threshold += 0.1
    return positional_word_parts
def update_word(page, positional_word_part_node, positional_word_parts):
    """Updates word according to new positional_word_parts.

    Does nothing if positional_word_parts is empty. Otherwise the part that
    corresponds to positional_word_part_node is replaced by the new parts, the
    transkription position holding it is replaced by newly created
    transkription positions, all ids are renumbered, the text of the word is
    rebuilt and the word is attached to the tree of the page.

    Note: positional_word_parts is reversed in place.

    :param page: page that owns the word (its words and page_tree are used)
    :param positional_word_part_node: node of the part that gets replaced
    :param positional_word_parts: list of new parts
    """
    if len(positional_word_parts) < 1:
        return

    # The ids of the node, its parent and its grandparent are used as list indices.
    part_index = int(positional_word_part_node.get('id'))
    position_node = positional_word_part_node.getparent()
    position_index = int(position_node.get('id'))
    word_index = int(position_node.getparent().get('id'))

    word = page.words[word_index]
    old_position = word.transkription_positions[position_index]
    parts = old_position.positional_word_parts

    # Swap the old part for the new ones: inserting the reversed list at a
    # fixed index leaves the new parts in their original order.
    parts.pop(part_index)
    positional_word_parts.reverse()
    for new_part in positional_word_parts:
        parts.insert(part_index, new_part)
    for new_id, part in enumerate(parts):
        part.id = new_id

    new_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(
        parts,
        debug_msg_string='update word from ' + __file__,
        transkription_position_id=position_index)

    # Same replacement scheme one level up, for the transkription positions.
    word.transkription_positions.pop(position_index)
    new_positions.reverse()
    for new_position in new_positions:
        word.transkription_positions.insert(position_index, new_position)

    text = ''
    for new_id, position in enumerate(word.transkription_positions):
        position.id = new_id
        position.writing_process_id = old_position.writing_process_id
        text += ''.join(part.text for part in position.positional_word_parts)

    if word.text != text:
        word.text = text
    word.attach_word_to_tree(page.page_tree)
def fix_missing_glyphs(svg_word_pos_file, manuscript_file=None):
    """Finds missing glyphs for xml file of type FILE_TYPE_SVG_WORD_POSITION.

    Every positional word part without a symbol-id attribute is looked up
    again and its word is updated; the page is then written back to
    svg_word_pos_file and reloaded in order to count the remaining parts
    without symbol-id. If none remain, the status of the file is set to 'OK'.
    Nothing happens if svg_word_pos_file is not an existing file.

    :param svg_word_pos_file: name of the xml file that is fixed in place
    :param manuscript_file: handed on to update_svgposfile_status (may be None)
    """
    if not isfile(svg_word_pos_file):
        return
    verbose = not UNITTESTING

    if verbose:
        print(Fore.LIGHTBLUE_EX + 'Fixing missing glyphs for file {} ... '.format(svg_word_pos_file), end='')
    page = Page(xml_source_file=svg_word_pos_file)
    transkription_field = TranskriptionField(page.svg_file)
    svg_path_tree = ET.parse(page.svg_file)
    namespaces = {}
    for prefix, uri in svg_path_tree.getroot().nsmap.items():
        # the default namespace has no prefix; it is registered as 'ns'
        namespaces['ns' if prefix is None else prefix] = uri

    missing_glyph_query = '//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]'
    nodes_without_symbol = page.page_tree.xpath(missing_glyph_query)
    number_of_missing_glyphs = len(nodes_without_symbol)
    for node in nodes_without_symbol:
        found_parts = find_missing_glyph_for_pwp(
            node, svg_path_tree, namespaces,
            xmin=transkription_field.xmin, ymin=transkription_field.ymin)
        update_word(page, node, found_parts)
    write_pretty(
        xml_element_tree=page.page_tree,
        file_name=svg_word_pos_file,
        script_name=__file__,
        file_type=FILE_TYPE_SVG_WORD_POSITION)

    # Reload the written file and count what is still missing.
    page = Page(xml_source_file=svg_word_pos_file)
    new_number_of_missing_glyphs = len(page.page_tree.xpath(missing_glyph_query))
    if verbose:
        if new_number_of_missing_glyphs == 0:
            result_color = Fore.LIGHTBLUE_EX
        else:
            result_color = Fore.MAGENTA
        number_fixed = number_of_missing_glyphs - new_number_of_missing_glyphs
        print(result_color + ' {0}/{1}'.format(number_fixed, number_of_missing_glyphs), end='')
        print(Fore.LIGHTBLUE_EX + ' fixed.', end='')
        print(Style.RESET_ALL)
    if new_number_of_missing_glyphs == 0:
        update_svgposfile_status(svg_word_pos_file, manuscript_file=manuscript_file, status='OK')
def get_filelist_and_manuscript_file(file_a, file_b=None):
    """Returns a file list and a manuscript file (or None)

    The text of 'metadata/type' in file_a decides the result:
    - FILE_TYPE_SVG_WORD_POSITION with at least one positional word part
      lacking a symbol-id: the list contains file_a, file_b is the manuscript file.
    - FILE_TYPE_XML_MANUSCRIPT: file_a is the manuscript file; the list
      contains file_b if given, otherwise the 'output' attributes of all pages
      whose status contains the lowercased WARN_NO_USE_NODE_FOUND message.
    - anything else: empty list and None.

    :param file_a: name of a xml file (manuscript or svg word position)
    :param file_b: optional name of the complementary xml file
    :return: tuple (file list, manuscript file or None)
    """
    source_tree = ET.parse(file_a)
    file_type = source_tree.getroot().find('metadata/type').text

    if file_type == FILE_TYPE_SVG_WORD_POSITION\
            and len(source_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) > 0:
        # symbol ids are missing in file_a itself
        return [file_a], file_b

    if file_type == FILE_TYPE_XML_MANUSCRIPT:
        if file_b is not None:
            return [file_b], file_a
        status_marker = PositionalWordPart.WARN_NO_USE_NODE_FOUND.lower()
        output_files = source_tree.xpath('//page[contains(@status, "{}")]/@output'.format(status_marker))
        return output_files, file_a

    return [], None
def usage():
    """Prints the docstring of main, which serves as the help text.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to fix missing glyphs.
    svgscripts/fix_missing_glyphs.py [OPTIONS] <xmlManuscriptFile|svgWordPosition>-File [<xmlManuscriptFile|svgWordPosition>-File]
    <xmlManuscriptFile> a xml file about a manuscript, containing information about its pages.
    <svgWordPosition> a xml file about a page, containing information about svg word positions.
    OPTIONS:
    -h|--help: show help
    :return: exit code (int)
    """
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        return 2

    if any(flag in ('-h', '--help') for flag, _ in opts):
        usage()
        return 0
    if not args:
        usage()
        return 2

    file_a = args[0]
    if not isfile(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))

    # A second argument is only used if it names an existing file.
    file_b = args[1] if len(args) > 1 and isfile(args[1]) else None
    file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b)
    for svg_word_pos_file in file_list:
        fix_missing_glyphs(svg_word_pos_file, manuscript_file=manuscript_file)
    return 0
if __name__ == "__main__":
    # Run with the command line arguments (program name stripped) and use
    # the value returned by main as the exit code of the process.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
Event Timeline
Log In to Comment