Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91235956
fix_boxes.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Nov 9, 05:57
Size
8 KB
Mime Type
text/x-python
Expires
Mon, Nov 11, 05:57 (2 d)
Engine
blob
Format
Raw Data
Handle
22178075
Attached To
rNIETZSCHEPYTHON nietzsche-python
fix_boxes.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from
colorama
import
Fore
,
Style
from
deprecated
import
deprecated
from
functools
import
cmp_to_key
import
getopt
import
inspect
import
lxml.etree
as
ET
import
re
import
shutil
import
string
from
svgpathtools
import
svg2paths2
,
svg_to_paths
from
svgpathtools.path
import
Path
as
SVGPath
from
svgpathtools.path
import
Line
import
sys
import
tempfile
from
operator
import
attrgetter
import
os
from
os
import
listdir
,
sep
,
path
,
setpgrp
,
devnull
from
os.path
import
exists
,
isfile
,
isdir
,
dirname
,
basename
from
progress.bar
import
Bar
import
warnings
from
fix_old_data
import
save_page
sys
.
path
.
append
(
'svgscripts'
)
from
convert_wordPositions
import
HTMLConverter
from
datatypes.box
import
Box
from
datatypes.faksimile
import
FaksimilePage
from
datatypes.archival_manuscript
import
ArchivalManuscriptUnity
from
datatypes.mark_foreign_hands
import
MarkForeignHands
from
datatypes.page
import
Page
,
STATUS_MERGED_OK
,
STATUS_POSTMERGED_OK
from
datatypes.path
import
Path
from
datatypes.text_connection_mark
import
TextConnectionMark
from
datatypes.transkriptionField
import
TranskriptionField
from
datatypes.transkription_position
import
TranskriptionPosition
from
datatypes.word
import
Word
,
update_transkription_position_ids
from
join_faksimileAndTranskription
import
sort_words
from
util
import
back_up
,
back_up_svg_file
,
copy_faksimile_svg_file
from
process_files
import
update_svgposfile_status
from
process_words_post_merging
import
update_faksimile_line_positions
,
MERGED_DIR
sys
.
path
.
append
(
'shared_util'
)
from
myxmlwriter
import
write_pretty
,
xml_has_type
,
FILE_TYPE_SVG_WORD_POSITION
,
FILE_TYPE_XML_MANUSCRIPT
from
main_util
import
create_function_dictionary
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

# When True, fix_boxes() does not save the page and main() neither prints
# progress messages nor creates back-ups.
UNITTESTING = False
# NOTE(review): not referenced in the visible part of this file -- confirm
# whether this threshold is still needed.
MAX_SVG_XY_THRESHOLD = 10
# Status text that main() passes as `status_contains` when selecting pages.
BOX_ERROR_STATUS = 'box error'
# Value of the `msg` attribute of the <debug> nodes that fix_boxes() looks for.
DEBUG_MSG = 'TODO: should have a box'
class WordWithBoxes(Word):
    """Word that gets its boxes (re)attached from its <debug> child nodes."""

    @classmethod
    def create_cls(cls, word_node):
        """Build a word from a (lxml.Element) node and attach its boxes.

        Every './debug' child of ``word_node`` is inspected: if it has an
        'overwritten-by' attribute the word is split and the box is attached
        to the matching part, otherwise a box with id 0 (and
        earlier_version=False) is attached to the whole word.  Afterwards the
        correction history is created; if that produced corrections, the
        ``overwrites_word`` reference of every word part is cleared.

        [:return:] WordWithBoxes
        """
        instance = super().create_cls(word_node)
        instance.missing_boxes = []
        for position, debug in enumerate(word_node.xpath('./debug')):
            box_text = debug.get('text')
            # A missing or empty attribute compares unequal to 'true'.
            earlier = debug.get('earlier-version') == 'true'
            overwriting_text = debug.get('overwritten-by')
            if overwriting_text is None:
                attach_box(instance, 0, box_text, False)
            else:
                split_into_parts_and_attach_box(instance, position, box_text, earlier, overwriting_text)
        instance.create_correction_history()
        if len(instance.corrections) > 0:
            for part in instance.word_parts:
                part.overwrites_word = None
        return instance
def attach_box(target_word, box_index, earlier_text, is_earlier_version):
    """Create a Box for ``target_word`` and store it as ``target_word.word_box``.

    The box path is derived from the word's transkription position.  If the
    word has more than one transkription position, the positional word parts
    of all of them are collected into one new TranskriptionPosition first.

    :param target_word: the word that receives the box
    :param box_index: id of the new box
    :param earlier_text: text passed to the box as ``earlier_text``
    :param is_earlier_version: flag passed to the box as ``earlier_version``
    """
    positions = target_word.transkription_positions
    box_position = positions[0]
    if len(positions) > 1:
        merged_parts = []
        for position in positions:
            merged_parts.extend(position.positional_word_parts)
        box_position = TranskriptionPosition(positional_word_parts=merged_parts)
    box_path = Path.create_path_from_transkription_position(box_position).path
    target_word.word_box = Box(
        id=box_index,
        path=box_path,
        earlier_text=earlier_text,
        earlier_version=is_earlier_version,
    )
def split_into_parts_and_attach_box(target_word, box_index, missing_text, is_earlier_version, overwritten_by, child_process=False) -> list:
    """Split word into word parts and attach a box to the part with text == overwritten_by.

    If ``target_word`` already has word parts, the search starts behind the
    last part that already carries a box and recurses into each remaining
    part until one of the resulting parts has the text ``overwritten_by``.
    If it has no word parts and its text contains ``overwritten_by``, the word
    is split and the box is attached to the part whose text equals
    ``overwritten_by``.

    :param target_word: the word to split
    :param box_index: id of the box that will be attached
    :param missing_text: earlier text stored on the box
    :param is_earlier_version: earlier-version flag stored on the box
    :param overwritten_by: text of the part that should receive the box
    :param child_process: True for recursive calls; the word is then not
        modified, the new parts are only returned to the caller
    :return: list of word parts (empty if nothing matched)
    """
    if len(target_word.word_parts) > 0:
        # Continue behind the last part that already has a box.
        start = 0
        boxed_parts = [wp for wp in target_word.word_parts if wp.word_box is not None]
        if boxed_parts:
            start = target_word.word_parts.index(boxed_parts[-1]) + 1
        child_word_parts = []
        # The slice is a copy, so target_word.word_parts may be modified below.
        for wp in target_word.word_parts[start:]:
            word_parts = split_into_parts_and_attach_box(wp, box_index, missing_text, is_earlier_version,
                                                         overwritten_by, child_process=True)
            if child_process:
                child_word_parts += word_parts
            elif len(word_parts) > 0:
                old_index = target_word.word_parts.index(wp)
                # Replace the old part by all new parts *in order*.  Inserting
                # each additional part at old_index + 1 would reverse them.
                target_word.word_parts[old_index:old_index + 1] = word_parts
            if overwritten_by in [new_wp.text for new_wp in word_parts]:
                break
        if child_process:
            return child_word_parts
        return target_word.word_parts
    elif overwritten_by in target_word.text:
        new_words_triple = target_word.split(overwritten_by)
        new_words = [w for w in new_words_triple if w is not None]
        # NOTE(review): raises IndexError if split() yields no part whose text
        # equals overwritten_by exactly -- confirm this cannot happen.
        word_with_box = [w for w in new_words if w.text == overwritten_by][0]
        attach_box(word_with_box, box_index, missing_text, is_earlier_version)
        if not child_process:
            if len(new_words_triple) > 1:
                target_word.word_parts = list(new_words)
                target_word.transkription_positions = []
            else:
                target_word.word_box = word_with_box.word_box
        return new_words
    return []
def fix_boxes(page) -> int:
    """Fix boxes and return exit code

    For every word node that has a <debug> child whose msg equals DEBUG_MSG,
    a WordWithBoxes is created and put in place of the page word with the
    same id and text.  Unless UNITTESTING is set, the page is saved afterwards.

    :return: 0 on success, 2 as soon as no page word matches a created word
        (the page is not saved in that case)
    """
    debug_xpath = '//' + Word.XML_TAG + f'/debug[@msg="{DEBUG_MSG}"]'
    # A set, because several debug nodes can share the same parent word node.
    flagged_nodes = {debug_node.getparent() for debug_node in page.page_tree.xpath(debug_xpath)}
    for flagged_node in flagged_nodes:
        boxed_word = WordWithBoxes.create_cls(flagged_node)
        candidates = [w for w in page.words if w.id == boxed_word.id and w.text == boxed_word.text]
        if not candidates:
            return 2
        page.words[page.words.index(candidates[0])] = boxed_word
    if not UNITTESTING:
        save_page(page, attach_first=True)
    return 0
def usage():
    """Print the help text of this script, i.e. the docstring of main().
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to fix boxes.

    svgscripts/fix_boxes.py [OPTIONS] <xmlManuscriptFile|svg_pos_file>

        <xmlManuscriptFile> a xml file about a manuscript, containing information about its pages.
        <svg_pos_file> a xml file about a page, containing information about svg word positions.

        OPTIONS:
        -h|--help show help

        :return: exit code (int)
    """
    # NOTE: the docstring above is printed verbatim by usage().
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, _arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    xml_file = args[0]
    if not isfile(xml_file):
        raise FileNotFoundError('File {} does not exist!'.format(xml_file))
    # Count pages across the whole run; the counter must not be reset per
    # page, otherwise the summary below can never report more than one page.
    counter = 0
    for page in Page.get_pages_from_xml_file(xml_file, status_contains=BOX_ERROR_STATUS):
        if not UNITTESTING:
            print(Fore.CYAN + f'Fixing boxes of {page.title}, {page.number} ...' + Style.RESET_ALL)
            back_up(page, page.xml_file)
        if fix_boxes(page) == 0:
            counter += 1
    if not UNITTESTING:
        print(Style.RESET_ALL + f'[{counter} pages changed]')
    return exit_status
# Script entry point: run main() with the command line arguments and use its
# return value as the process exit code.
if __name__ == "__main__":
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
Event Timeline
Log In to Comment