#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process words after they have been merged with faksimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from colorama import Fore, Style
from deprecated import deprecated
from functools import cmp_to_key
import getopt
import inspect
import lxml.etree as ET
from operator import attrgetter
import os
from os import listdir, sep, path, setpgrp, devnull
from os.path import exists, isfile, isdir, dirname, basename
from pathlib import Path as PathlibPath
from progress.bar import Bar
import re
import shutil
import string
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.path import Path as SVGPath
from svgpathtools.path import Line
import sys
import tempfile
import warnings

if dirname(__file__) not in sys.path:
    sys.path.append(dirname(__file__))

from datatypes.box import Box
from datatypes.manuscript import ArchivalManuscriptUnity
from datatypes.mark_foreign_hands import MarkForeignHands
from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK
from datatypes.path import Path
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.transkriptionField import TranskriptionField
from datatypes.word import Word, do_paths_intersect_saveMode, update_transkription_position_ids
from util import back_up
from process_files import update_svgposfile_status
from process_footnotes import categorize_footnotes

sys.path.append('shared_util')
from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

UNITTESTING = False
DEBUG_WORD = None
MERGED_DIR = 'merged'
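# DEBUG_WORD can be set to a word-like object exposing .text and .line_number; when set,
# mark_word_if_it_intersects_with_paths_as_deleted() collects the deletion paths on that
# word's line so they can be inspected (see the commented-out print statement there).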

def categorize_paths(page, transkription_field=None):
"""Categorize all paths that are part of the transkription field.
:return: a dictionary containig a list for each category of path.
"""
    if page.source is not None and isfile(page.source):
        MAX_HEIGHT_LINES = 1
        max_line = sorted([ line_number.bottom - line_number.top for line_number in page.line_numbers if line_number.id % 2 == 0 ],\
                          reverse=True)[0] + 2 if len(page.line_numbers) > 0 else 17
        tr_xmin = transkription_field.xmin if transkription_field is not None else 0.0
        tr_ymin = transkription_field.ymin if transkription_field is not None else 0.0
        paths, attributes = svg_to_paths.svg2paths(page.source)
        allpaths_on_tf = []
        allpaths_outside_tf = []
        attributes_outside_tf = []
        if transkription_field is None:
            transkription_field = TranskriptionField(page.source)
        for index, path in enumerate(paths):
            attribute = attributes[index]
            if len(path) > 0\
               and path != transkription_field.path\
               and path.bbox()[0] >= tr_xmin\
               and path.bbox()[1] <= transkription_field.xmax:
                allpaths_on_tf.append(Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page))
            elif len(path) > 0\
                 and path != transkription_field.path:
                allpaths_outside_tf.append(Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page))
                #print(index, allpaths_outside_tf[len(allpaths_outside_tf)-1].path, path)
                attributes_outside_tf.append(attribute)
        path_dict = { 'text_area_deletion_paths': [],\
                      'deletion_or_underline_paths': [],\
                      'box_paths': [],\
                      'dots_paths': [],\
                      'word_connector_paths': [],\
                      'uncategorized_paths': [] }
        for mypath in allpaths_on_tf:
            xmin, xmax, ymin, ymax = mypath.path.bbox()
            start_line_number = page.get_line_number(mypath.path.start.imag - tr_ymin)
            if abs(xmax - xmin) < 1 and abs(ymax - ymin) < 1:
                path_dict.get('dots_paths').append(mypath)
            elif abs(ymax - ymin) > MAX_HEIGHT_LINES and abs(ymax - ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed():
                path_dict.get('box_paths').append(mypath)
            elif abs(ymax - ymin) > MAX_HEIGHT_LINES and abs(ymax - ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed():
                path_dict.get('word_connector_paths').append(mypath)
            elif abs(ymax - ymin) < MAX_HEIGHT_LINES:
                mypath.start_line_number = start_line_number
                path_dict.get('deletion_or_underline_paths').append(mypath)
            elif start_line_number != -1 and start_line_number != page.get_line_number(mypath.path.end.imag - tr_ymin):
                # Check for "ladder", i.e. a path with 3 segments (seg0 is horizontal on line x, seg1 moves to line x+1, seg2 is horizontal on line x+1)
                if start_line_number + 1 == page.get_line_number(mypath.path.end.imag - tr_ymin)\
                   and len(mypath.path._segments) == 3\
                   and abs(mypath.path._segments[0].bbox()[3] - mypath.path._segments[0].bbox()[2]) < MAX_HEIGHT_LINES\
                   and abs(mypath.path._segments[2].bbox()[3] - mypath.path._segments[2].bbox()[2]) < MAX_HEIGHT_LINES:
                    for index in 0, 2:
                        new_path = Path(parent_path=mypath, path=SVGPath(mypath.path._segments[index]))
                        new_path.start_line_number = page.get_line_number(new_path.path.start.imag - tr_ymin)
                        path_dict.get('deletion_or_underline_paths').append(new_path)
                else:
                    path_dict.get('text_area_deletion_paths').append(mypath)
            else:
                path_dict.get('uncategorized_paths').append(mypath)
        underline_path = mark_words_intersecting_with_paths_as_deleted(page, path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin)
        path_dict.update({'underline_path': underline_path})
        path_dict['uncategorized_paths'] += process_word_boxes(page, path_dict.get('box_paths'), transkription_field,\
                                                               paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line)
        return path_dict
    elif not UNITTESTING:
        error_msg = 'Svg source file {} does not exist!'.format(page.source)\
                if page.source is not None else 'Page does not contain a source file!'
        raise FileNotFoundError(error_msg)
    return {}
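
# Example use of categorize_paths (a sketch; the file name is hypothetical and assumes a merged
# svg_pos_file whose <page> element points to an existing svg source):
#   page = Page('xml/some_merged_page.xml')
#   path_dict = categorize_paths(page)
#   print(len(path_dict['deletion_or_underline_paths']), 'deletion/underline paths')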

def copy_page_to_merged_directory(page, manuscript_file=None):
    """Copy page to the directory that contains the first version of all svg_pos_files that have been
    merged with the faksimile position data. MERGED_DIR is a subfolder of the svg_pos_file directory.
    """
    svg_pos_file = PathlibPath(page.page_tree.docinfo.URL)
    target_dir = svg_pos_file.parent / MERGED_DIR
    if not target_dir.is_dir():
        target_dir.mkdir()
    target_pos_file = target_dir / svg_pos_file.name
    save_page(page, str(svg_pos_file), target_svg_pos_file=str(target_pos_file), status=STATUS_MERGED_OK, manuscript_file=manuscript_file)

def find_special_words(page, transkription_field=None):
    """Find special words, remove them from words, process their content.
    """
    if page.source is None or not isfile(page.source):
        raise FileNotFoundError('Page does not have a source!')
    if transkription_field is None:
        transkription_field = TranskriptionField(page.source)
    special_char_list = MarkForeignHands.get_special_char_list()
    special_char_list += TextConnectionMark.get_special_char_list()
    single_char_words = [ word for word in page.words if len(word.text) == 1 and word.text in special_char_list ]
    if not UNITTESTING:
        bar = Bar('find special words', max=len(single_char_words))
    for word in single_char_words:
        not bool(UNITTESTING) and bar.next()
        if word.text == MarkForeignHands.CLASS_MARK:
            id = len(page.mark_foreign_hands)
            page.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=id))
            page.words.remove(word)
        elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\
             or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\
                 and any(style in page.sonderzeichen_list for style\
                         in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))):
            id = len(page.text_connection_marks)
            page.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=id))
            page.words.remove(word)
    not bool(UNITTESTING) and bar.finish()
    svg_tree = ET.parse(page.source)
    page.update_page_type(transkription_field=transkription_field)
    page.update_line_number_area(transkription_field, svg_tree=svg_tree)
    italic_classes = [ key for key in page.style_dict\
                       if bool(page.style_dict[key].get('font-family')) and page.style_dict[key]['font-family'].endswith('Italic') ]
    if len(page.mark_foreign_hands) > 0:
        MarkForeignHands.find_content(page.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\
                                      SonderzeichenList=page.sonderzeichen_list)
    if len(page.text_connection_marks) > 0:
        TextConnectionMark.find_content_in_footnotes(page.text_connection_marks, transkription_field, svg_tree,\
                                                     title=page.title, page_number=page.number)
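
# find_special_words() turns single-character special words into MarkForeignHands /
# TextConnectionMark entries, removes them from page.words, and then looks up their content
# in the svg source (MarkForeignHands.find_content) and in the footnotes
# (TextConnectionMark.find_content_in_footnotes).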

def mark_words_intersecting_with_paths_as_deleted(page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Marks all words that intersect with deletion paths as deleted
    and adds these paths to word_deletion_paths.
    [:return:] list of .path.Path that might be word_underline_paths
    """
    if not UNITTESTING:
        bar = Bar('mark words that intersect with deletion paths', max=len(page.words))
    for word in page.words:
        not bool(UNITTESTING) and bar.next()
        word = mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        for part_word in word.word_parts:
            part_word = mark_word_if_it_intersects_with_paths_as_deleted(part_word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        word.partition_according_to_deletion()
    not bool(UNITTESTING) and bar.finish()
    # return those paths in deletion_paths that are not in page.word_deletion_paths
    return [ word_underline_path for word_underline_path in set(deletion_paths) - set(page.word_deletion_paths) ]

def mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0):
    """Marks a word as deleted if it intersects with deletion paths
    and adds these paths to word_deletion_paths.
    [:return:] word
    """
    word.deleted = False
    for transkription_position in word.transkription_positions:
        word_path = Path.create_path_from_transkription_position(transkription_position,\
                                                                 tr_xmin=tr_xmin, tr_ymin=tr_ymin)
        intersecting_paths = [ deletion_path for deletion_path in deletion_paths\
                               if do_paths_intersect_saveMode(deletion_path, word_path) ]
        if DEBUG_WORD is not None and word.text == DEBUG_WORD.text and word.line_number == DEBUG_WORD.line_number:
            relevant_paths = [ path for path in deletion_paths if path.start_line_number == DEBUG_WORD.line_number ]
            #print(word.line_number, word_path.path.bbox(), [ path.path.bbox() for path in relevant_paths])
        if len(intersecting_paths) > 0:
            #print(f'{word.line_number}: {word.id}, {word.text}: {intersecting_paths}')
            transkription_position.deleted = True
            for deletion_path in intersecting_paths:
                if deletion_path.parent_path is not None:
                    deletion_path = deletion_path.parent_path
                if deletion_path not in page.word_deletion_paths:
                    deletion_path.tag = Path.WORD_DELETION_PATH_TAG
                    deletion_path.attach_object_to_tree(page.page_tree)
                    page.word_deletion_paths.append(deletion_path)
    return word

def post_merging_processing_and_saving(svg_pos_file=None, new_words=None, page=None, manuscript_file=None, target_svg_pos_file=None):
    """Process words after merging with faksimile word positions.
    """
    if page is None and svg_pos_file is None:
        raise Exception('ERROR: post_merging_processing_and_saving needs either a Page or a svg_pos_file!')
    if page is None:
        page = Page(svg_pos_file)
    if page.source is None or not isfile(page.source):
        raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file))
    if svg_pos_file is None:
        svg_pos_file = page.page_tree.docinfo.URL
    if new_words is not None:
        page.words = sorted(new_words, key=attrgetter('id'))
        for word_node in page.page_tree.xpath('.//word'):
            word_node.getparent().remove(word_node)
    manuscript = ArchivalManuscriptUnity.create_cls(manuscript_file)\
            if manuscript_file is not None\
            else None
    copy_page_to_merged_directory(page, manuscript_file=manuscript_file)
    transkription_field = TranskriptionField(page.source)
    update_faksimile_line_positions(page)
    find_special_words(page, transkription_field=transkription_field)
    #update_writing_process_ids(page)
    page.update_styles(manuscript=manuscript, partition_according_to_styles=True)
    #TODO: find_hyphenated_words(page)
    categorize_paths(page, transkription_field=transkription_field)
    categorize_footnotes(page)
    save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=STATUS_POSTMERGED_OK, manuscript_file=manuscript_file)
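
# Example use of post_merging_processing_and_saving (a sketch; both file names are hypothetical):
#   post_merging_processing_and_saving(svg_pos_file='xml/some_merged_page.xml',
#                                      manuscript_file='xml/some_manuscript.xml')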

def process_word_boxes(page, box_paths, transkription_field, paths=None, attributes=None, max_line=17) -> list:
    """Process word boxes: partition words according to word boxes.
    [:return:] a list of paths that are not boxes
    """
    MAX_HEIGHT_LINES = 1
    not_boxes = []
    if not UNITTESTING:
        bar = Bar('process word boxes', max=len(page.words))
    svg_tree = ET.parse(page.source)
    namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() }
    allpaths_on_margin_field = []
    if paths is None or attributes is None:
        paths = []
        raw_paths, attributes = svg_to_paths.svg2paths(page.source)
        for index, raw_path in enumerate(raw_paths):
            paths.append(Path.create_cls(id=index, path=raw_path, style_class=attributes[index].get('class'), page=page))
    for index, mypath in enumerate(paths):
        path = mypath.path
        xmin, xmax, ymin, ymax = path.bbox()
        attribute = attributes[index]
        if len(path) > 0\
           and path != transkription_field.path\
           and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\
                or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\
           and abs(ymax - ymin) < max_line:
            allpaths_on_margin_field.append(mypath)
            #Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page))
    box_line_number_dict = {}
    for box_path in sorted(box_paths, key=lambda path: path.get_median_y()):
        line_number = page.get_line_number(box_path.get_median_y(tr_ymin=transkription_field.ymin))
        if line_number > 0:
            if line_number not in box_line_number_dict.keys():
                box_line_number_dict.update({ line_number: [ box_path ] })
            else:
                box_line_number_dict.get(line_number).append(box_path)
    boxes = []
    for line_number in box_line_number_dict.keys():
        box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x())
        margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\
                                        if page.get_line_number(margin_box.get_median_y(tr_ymin=transkription_field.ymin)) == line_number ],\
                                      key=lambda path: path.get_x())
        threshold = 3 if line_number % 2 == 0 else 1.5
        if len(margin_boxes_on_line) > 0:
            for box_path in box_paths_on_line:
                #print(line_number, box_path.path.d(), len(margin_boxes_on_line))
                box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\
                                     transkription_field=transkription_field, namespaces=namespaces, threshold=threshold)
                if box is not None:
                    boxes.append(box)
        else:
            not_boxes += box_paths_on_line
    if len(boxes) > 0:
        for word in page.words:
            word.process_boxes(boxes, tr_xmin=transkription_field.xmin, tr_ymin=transkription_field.ymin)
            word.create_correction_history(page)
            if not bool(UNITTESTING):
                bar.next()
            elif word.earlier_version is not None:
                #print(f'{word.text} -> {word.earlier_version.text}')
                if word.earlier_version.earlier_version is not None:
                    print(f'{word.earlier_version.earlier_version.text}')
        not bool(UNITTESTING) and bar.finish()
    return not_boxes
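
# Note on process_word_boxes(): box paths on lines without a corresponding margin box are
# returned as "not boxes"; categorize_paths() adds them back to its 'uncategorized_paths' list.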

def reset_page(page):
    """Reset all words that have word_parts in order to run the script a second time.
    """
    svg_pos_file = PathlibPath(page.page_tree.docinfo.URL)
    first_merge_version = svg_pos_file.parent / MERGED_DIR / svg_pos_file.name
    if first_merge_version.exists():
        page = Page(str(first_merge_version))
    else:
        word_with_wordparts = [ word for word in page.words if len(word.word_parts) > 0 ]
        word_with_wordparts += [ word for word in page.words if word.earlier_version is not None ]
        page_changed = False
        if len(word_with_wordparts) > 0:
            for word in word_with_wordparts:
                word.undo_partitioning()
                update_transkription_position_ids(word)
            page_changed = True
        no_line_numbers = [ word for word in page.words if word.line_number == -1 ]
        if len(no_line_numbers) > 0:
            for word in no_line_numbers:
                if len(word.transkription_positions) > 0:
                    word.line_number = page.get_line_number((word.transkription_positions[0].top + word.transkription_positions[0].bottom)/2)
                else:
                    msg = f'Word {word.id} {word.text} has no transkription_position!'
                    warnings.warn(msg)
            page_changed = True
        if page_changed:
            page.update_and_attach_words2tree()

def save_page(page, svg_pos_file, target_svg_pos_file=None, status=None, manuscript_file=None):
    """Save page to target_file and update status of file.
    """
    page.update_and_attach_words2tree()
    if not UNITTESTING:
        if target_svg_pos_file is None:
            target_svg_pos_file = svg_pos_file
        if status is not None:
            update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=status)
        write_pretty(xml_element_tree=page.page_tree, file_name=target_svg_pos_file,\
                     script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)

def update_faksimile_line_positions(page):
    """Update faksimile_positions of the lines
    """
    num_lines = len(page.line_numbers)
    ymin = page.text_field.ymin\
            if page.text_field is not None\
            else 0.0
    for line_number in page.line_numbers:
        if len([ word.faksimile_positions[0] for word in page.words\
                 if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) > 0:
            line_number.faksimile_inner_top = min([ word.faksimile_positions[0].top for word in page.words\
                                                    if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ])
            line_number.faksimile_inner_bottom = max([ word.faksimile_positions[0].bottom for word in page.words\
                                                       if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ])
            if line_number.id % 2 == 0:
                line_number.faksimile_outer_top = line_number.faksimile_inner_top - ymin
                line_number.faksimile_outer_bottom = line_number.faksimile_inner_bottom - ymin
    for index, line_number in enumerate(page.line_numbers):
        if line_number.faksimile_inner_bottom == 0.0\
           or line_number.faksimile_inner_bottom < line_number.faksimile_inner_top:
            if index == 0 and num_lines > 1:
                line_number.faksimile_inner_bottom = page.line_numbers[index+1].top
            elif index == num_lines - 1 and page.text_field is not None:
                line_number.faksimile_inner_bottom = round(page.text_field.height + page.text_field.ymin, 3)
            elif index > 0 and index < num_lines - 1:
                line_number.faksimile_inner_bottom = page.line_numbers[index+1].faksimile_inner_top\
                        if page.line_numbers[index+1].faksimile_inner_top > page.line_numbers[index-1].faksimile_inner_bottom\
                        else page.line_numbers[index-1].faksimile_inner_bottom
        line_number.attach_object_to_tree(page.page_tree)

def update_writing_process_ids(page):
    """Update the writing_process_ids of the words and split accordingly.
    """
    for word in page.words:
        word.set_writing_process_id_to_transkription_positions(page)
        word.partition_according_to_writing_process_id()

def usage():
    """prints information on how to use the script
    """
    print(main.__doc__)

def main(argv):
    """This program can be used to process words after they have been merged with faksimile data.

    svgscripts/process_words_post_merging.py [OPTIONS] <xmlManuscriptFile|svg_pos_file>

    <xmlManuscriptFile>     an xml file about a manuscript, containing information about its pages.
    <svg_pos_file>          an xml file about a page, containing information about svg word positions.

    OPTIONS:
        -h|--help                               show help
        -i|--include-missing-line-number        run script on files that contain words without line numbers
        -r|--rerun                              rerun script on a svg_pos_file that has already been processed

    :return: exit code (int)
    """
    status_not_contain = STATUS_POSTMERGED_OK
    include_missing_line_number = False
    try:
        opts, args = getopt.getopt(argv, "hir", ["help", "include-missing-line-number", "rerun"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-i', '--include-missing-line-number'):
            include_missing_line_number = True
        elif opt in ('-r', '--rerun'):
            status_not_contain = ''
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    file_a = args[0]
    if isfile(file_a):
        manuscript_file = file_a\
                if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\
                else None
        counter = 0
        for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK, status_not_contain=status_not_contain):
            reset_page(page)
            no_line_numbers = [ word for word in page.words if word.line_number == -1 ]
            if not include_missing_line_number and len(no_line_numbers) > 0:
                not UNITTESTING and print(Fore.RED + f'Page {page.title}, {page.number} has words with no line number!')
                for word in no_line_numbers:
                    not UNITTESTING and print(f'Word {word.id}: {word.text}')
            else:
                back_up(page, page.xml_file)
                not UNITTESTING and print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL)
                post_merging_processing_and_saving(page=page, manuscript_file=manuscript_file)
                counter += 1
        not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]')
    else:
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    return exit_status

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
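
# Example invocations (file names are hypothetical; see main.__doc__ for the options):
#   python3 svgscripts/process_words_post_merging.py xml/some_manuscript.xml
#   python3 svgscripts/process_words_post_merging.py -r xml/some_merged_page.xml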