Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F58921231
process_footnotes.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Apr 18, 09:40
Size
13 KB
Mime Type
text/x-python
Expires
Sat, Apr 20, 09:40 (2 d)
Engine
blob
Format
Raw Data
Handle
17085027
Attached To
rNIETZSCHEPYTHON nietzsche-python
process_footnotes.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to process the footnotes of pages after their words have been merged with facsimile data.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
from
colorama
import
Fore
,
Style
import
getopt
import
lxml.etree
as
ET
import
os
from
os
import
listdir
,
sep
,
path
,
setpgrp
,
devnull
from
os.path
import
exists
,
isfile
,
isdir
,
dirname
,
basename
from
pathlib
import
Path
as
PathlibPath
from
progress.bar
import
Bar
import
inspect
import
re
import
shutil
import
sys
import
warnings
if
dirname
(
__file__
)
not
in
sys
.
path
:
sys
.
path
.
append
(
dirname
(
__file__
))
from
datatypes.archival_manuscript
import
ArchivalManuscriptUnity
from
datatypes.page
import
Page
,
STATUS_MERGED_OK
,
STATUS_POSTMERGED_OK
from
datatypes.atypical_writing
import
AtypicalWriting
from
datatypes.clarification
import
Clarification
from
datatypes.editor_comment
import
EditorComment
from
datatypes.editor_correction
import
EditorCorrection
from
datatypes.footnotes
import
extract_footnotes
from
datatypes.imprint
import
extract_imprints
from
datatypes.line_continuation
import
LineContinuation
from
datatypes.standoff_tag
import
StandoffTag
from
datatypes.text
import
Text
from
datatypes.text_connection_mark
import
TextConnectionMark
from
datatypes.uncertain_decipherment
import
UncertainDecipherment
from
util
import
back_up
from
process_files
import
update_svgposfile_status
sys
.
path
.
append
(
'shared_util'
)
from
myxmlwriter
import
write_pretty
,
xml_has_type
,
FILE_TYPE_SVG_WORD_POSITION
,
FILE_TYPE_XML_MANUSCRIPT
__author__
=
"Christian Steiner"
__maintainer__
=
__author__
__copyright__
=
'University of Basel'
__email__
=
"christian.steiner@unibas.ch"
__status__
=
"Development"
__license__
=
"GPL v3"
__version__
=
"0.0.1"
UNITTESTING
=
False
ATYPICAL_GROUP
=
re
.
compile
(
r'(.*:.*]\s*)(¿)(.*)'
)
CLARIFICATION_GROUP
=
re
.
compile
(
r'(.*:.*]\s*)(Vk)(.*)'
)
CONTINUATION_GROUP
=
re
.
compile
(
r'(.*:\s*)(Fortsetzung\s*)'
)
COMMENT_GROUP
=
re
.
compile
(
r'(.*:.*])'
)
EDITOR_CORRECTION_GROUP
=
re
.
compile
(
r'(.*:.*]\s*)(>[?]*)(.*)'
)
LINE_REFERENCE_GROUP
=
re
.
compile
(
r'(\d+-|\d/(\d+/)*)*([0-9]+)(:.*)'
)
LINE_REFERENCE_GROUP_START_INDEX
=
1
LINE_REFERENCE_GROUP_MID_INDEX
=
2
LINE_REFERENCE_GROUP_END_INDEX
=
3
LINE_COMMENT_GROUP
=
re
.
compile
(
r'(.*\d+:)'
)
UNCERTAINTY_WORD_GROUP
=
re
.
compile
(
r'(.*:.*]\s*)([>]*\?)(.*)'
)
UNCERTAINTY_EDITOR_GROUP
=
re
.
compile
(
r'(.*)(\?)'
)
WORD_REFERENCE_GROUP
=
re
.
compile
(
r'(.*[0-9]+:\s*)(.*)(].*)'
)
DEBUG
=
False
def categorize_footnotes(page, footnotes=None, debug=False, skip_after=-1.0, find_content=False):
    """Categorize the footnotes of a page and attach the results to the page tree.

    Each footnote that starts with a line reference is handed to
    _process_line_match; for every other footnote a warning is issued.

    :param page: the page the footnotes belong to.
    :param footnotes: the footnotes to categorize; if None they are obtained
        by calling extract_footnotes(page, skip_after=skip_after).
    :param debug: accepted for backward compatibility, currently without effect.
    :param skip_after: passed on to extract_footnotes.
    :param find_content: if True and the page has text connection marks,
        TextConnectionMark.find_content_in_footnotes is called for the page.
    """
    # The former "DEBUG = debug" / "DEBUG = False" statements have been removed:
    # without a global declaration they only bound a local name and never
    # changed the module level flag.
    if footnotes is None:
        footnotes = extract_footnotes(page, skip_after=skip_after)
    for footnote in footnotes:
        line_match = re.match(LINE_REFERENCE_GROUP, footnote.content)
        if line_match is not None:
            _process_line_match(page, footnote, line_match)
        else:
            warnings.warn(f'Unknown editor comment without a line reference: <{footnote}>')
    if find_content and len(page.text_connection_marks) > 0:
        TextConnectionMark.find_content_in_footnotes(page, footnotes=footnotes)
    page.update_and_attach_words2tree()
    for line in page.lines:
        line.attach_object_to_tree(page.page_tree)
    if not UNITTESTING:
        # Nothing is written while unittesting.
        write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,
                     script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
def save_imprints(page):
    """Attach the imprints of a page to the page tree and save the tree.

    The imprints are obtained from extract_imprints(page). Nothing is written
    while UNITTESTING is True.

    :param page: the page whose imprints should be saved.
    """
    for imprint in extract_imprints(page):
        imprint.attach_object_to_tree(page.page_tree)
    if UNITTESTING:
        return
    # The name of the calling function is recorded together with the script name.
    # The frame has to be inspected right here: f_back of this frame is the caller.
    caller_name = inspect.currentframe().f_back.f_code.co_name
    write_pretty(
        xml_element_tree=page.page_tree,
        file_name=page.page_tree.docinfo.URL,
        script_name=f'{__file__}:{caller_name}',
        file_type=FILE_TYPE_SVG_WORD_POSITION,
    )
def _is_uncertain(footnote) -> bool:
    """Return whether footnote contains an italic sign for uncertainty.

    The footnote counts as uncertain if its content matches
    UNCERTAINTY_EDITOR_GROUP and the end of that match lies within the range
    (startIndex to endIndex, both inclusive) of a standoff markup whose
    css_string ends with 'italic;'.

    :param footnote: object with the attributes content and standoff_markups.
    """
    question_mark_match = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content)
    if question_mark_match is None:
        return False
    match_end = question_mark_match.end()
    for markup in footnote.standoff_markups:
        if not markup.css_string.endswith('italic;'):
            continue
        if match_end >= markup.startIndex and match_end <= markup.endIndex:
            return True
    return False
def _process_line_match(page, footnote, line_match):
    """Process footnote if reference to a line matches.

    If the footnote also refers to a word, it is handed to _process_word_match
    together with the last line number of the reference. Otherwise
    _process_line_reference is called for every referred line of the page.
    A warning is issued if the page has none of the referred lines.

    :param page: the page, its attributes lines and words are read.
    :param footnote: the footnote, its attribute content is read.
    :param line_match: the match of LINE_REFERENCE_GROUP on the content of footnote.
    """
    word_match = re.match(WORD_REFERENCE_GROUP, footnote.content)
    end_line_number = int(line_match.group(LINE_REFERENCE_GROUP_END_INDEX))
    start_reference = line_match.group(LINE_REFERENCE_GROUP_START_INDEX)
    if start_reference is None:
        # Single line, e.g. "12:".
        lines = [line for line in page.lines if line.id == end_line_number]
    elif line_match.group(LINE_REFERENCE_GROUP_MID_INDEX) is not None:
        # List of lines, e.g. "1/2/3:": start_reference is "1/2/".
        line_ids = [int(line_id) for line_id in start_reference.split('/') if line_id != '']
        line_ids.append(end_line_number)
        lines = [line for line in page.lines if line.id in line_ids]
    else:
        # Range of lines, e.g. "3-5:": start_reference is "3-", its last character is dropped.
        # NOTE(review): a two item list like "1/3:" also ends up here and is
        # treated as the range 1 to 3 -- confirm that this is intended.
        start_line_number = int(start_reference[0:-1])
        lines = [line for line in page.lines if start_line_number <= line.id <= end_line_number]
    if word_match is not None:
        _process_word_match(page.words, footnote, line_match, word_match.group(2), end_line_number)
    elif len(lines) > 0:
        # The uncertainty depends on the footnote only, so it is determined once for all lines.
        is_uncertain = _is_uncertain(footnote)
        for line in lines:
            _process_line_reference(page, footnote, line, is_uncertain)
    else:
        # Fix: the message used the undefined name line_number and raised a NameError.
        warnings.warn(f'Footnote refers to missing line {end_line_number}: {footnote}')
def _process_line_reference(page, footnote, line, is_uncertain):
    """Process footnote if there is a line reference.

    Appends either a LineContinuation or an EditorComment to the
    editor_comments of line. A warning is issued if the content of the
    footnote matches neither CONTINUATION_GROUP nor LINE_COMMENT_GROUP.

    :param page: not used by this function.
    :param footnote: the footnote, its attribute content is read.
    :param line: the line the footnote refers to.
    :param is_uncertain: whether the footnote is uncertain; only used for a
        line continuation, for a comment it is determined anew.
    """
    content = footnote.content
    continuation_match = re.match(CONTINUATION_GROUP, content)
    if continuation_match is not None:
        reference_string = content[continuation_match.end():]
        if is_uncertain:
            # Drop the last character (presumably the sign for uncertainty).
            reference_string = reference_string[:-1]
        continuation = LineContinuation.create_cls(reference_string=reference_string, is_uncertain=is_uncertain)
        line.editor_comments.append(continuation)
        return
    comment_match = re.match(LINE_COMMENT_GROUP, content)
    if comment_match is None:
        warnings.warn(f'Unknown editor comment for line "{line.id}": <{footnote}>')
        return
    is_uncertain = _is_uncertain(footnote)
    if is_uncertain:
        # Drop the last character (presumably the sign for uncertainty).
        comment = content[comment_match.end():-1].strip()
    else:
        comment = content[comment_match.end():].strip()
    line.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain))
def _find_referred_word(words, word_text, line_number):
    """Return the word or word part on line_number that word_text refers to, or None.
    """
    # word_text is literal text from a footnote: it is escaped so that
    # characters like "(", "[" or "?" are not interpreted as regex syntax.
    embedded_text = re.compile(rf'\W*{re.escape(word_text)}\W')
    words_on_line = [word for word in words if word.line_number == line_number]
    referred_words = [word for word in words_on_line
                      if word.text == word_text
                      or embedded_text.match(word.text)
                      or word.edited_text == word_text]
    referred_word_parts = [word.word_parts for word in words_on_line
                           if len(word.word_parts) > 0
                           and word_text in [word_part.text for word_part in word.word_parts]]
    overwritten_word_matches = [word for word in words_on_line
                                if len(word.word_parts) > 0
                                and any(word_part.overwrites_word is not None
                                        and word_part.overwrites_word.text == word_text
                                        for word_part in word.word_parts)]
    if len(referred_words) == 1:
        return referred_words[0]
    if len(overwritten_word_matches) > 0:
        return next(word_part.overwrites_word for word_part in overwritten_word_matches[0].word_parts
                    if word_part.overwrites_word is not None and word_part.overwrites_word.text == word_text)
    if len(referred_word_parts) > 0:
        return next(word_part for word_part in referred_word_parts[0] if word_part.text == word_text)
    if len(referred_words) > 0:
        # Several candidates: prefer a word whose text is exactly word_text.
        exact_matches = [word for word in referred_words if word.text == word_text]
        return exact_matches[0] if len(exact_matches) > 0 else referred_words[0]
    return None


def _add_word_comment(word, footnote, word_text):
    """Append the editor comment that corresponds to the content of footnote to word.
    """
    content = footnote.content
    is_uncertain = re.match(UNCERTAINTY_WORD_GROUP, content) is not None
    correction_match = re.match(EDITOR_CORRECTION_GROUP, content)
    if correction_match is not None:
        correction = correction_match.group(3).strip()
        word.editor_comments.append(EditorCorrection(correction_text=correction, is_uncertain=is_uncertain))
        if not is_uncertain:
            # Only a certain correction replaces the edited text of the word.
            word.edited_text = correction
    elif re.match(CLARIFICATION_GROUP, content) is not None:
        word.editor_comments.append(Clarification(text=footnote.extract_part(word_text, css_filter='bold;')))
    elif re.match(ATYPICAL_GROUP, content) is not None:
        text = footnote.extract_part(word_text, css_filter='bold;')\
                if footnote.markup_contains_css_filter('bold;')\
                else None
        word.editor_comments.append(AtypicalWriting(text=text))
    elif is_uncertain:
        word.editor_comments.append(UncertainDecipherment())
    else:
        comment_match = re.match(COMMENT_GROUP, content)
        if comment_match is None:
            warnings.warn(f'Unknown editor comment for word "{word.text}": <{footnote}>')
            return
        is_uncertain = _is_uncertain(footnote)
        if is_uncertain:
            # Drop the last character (presumably the sign for uncertainty).
            comment = content[comment_match.end():-1].strip()
        else:
            comment = content[comment_match.end():].strip()
        word.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain))


def _process_word_match(words, footnote, line_match, word_text, line_number, parent_word_composition=None):
    """Process footnote if there is a word reference.

    The word referred to is searched on line_number in this order:
    1. a word (or word part, or overwritten word) that corresponds to word_text,
    2. if word_text contains whitespace: each of its parts on its own,
    3. the word parts of those words that have line number -1 and word parts.
    A warning is issued if no word can be found.

    :param words: the words to search.
    :param footnote: the footnote, its content decides what kind of editor
        comment is appended to the word.
    :param line_match: not used here, only passed along on recursion.
    :param word_text: the text of the word the footnote refers to.
    :param line_number: the number of the line the word should be on.
    :param parent_word_composition: not used here; on recursion it is set to
        the text that has been split into parts.
    """
    word = _find_referred_word(words, word_text, line_number)
    if word is not None:
        _add_word_comment(word, footnote, word_text)
        return
    # Fix: the text was tested for any whitespace but split at ' ' only. A text
    # containing e.g. a tab was never split and recursed endlessly, double or
    # trailing spaces produced empty parts. split() splits at any whitespace
    # and drops empty parts.
    sub_texts = word_text.split()
    if re.match(r'.*\s.*', word_text) and len(sub_texts) > 0:
        for sub_text in sub_texts:
            _process_word_match(words, footnote, line_match, sub_text, line_number,
                                parent_word_composition=word_text)
        return
    words_without_line = [word for word in words if word.line_number == -1 and len(word.word_parts) > 0]
    if len(words_without_line) > 0:
        new_words = [word_part for word in words_without_line for word_part in word.word_parts]
        _process_word_match(new_words, footnote, line_match, word_text, line_number)
        return
    warnings.warn(f'No word found with text "{word_text}" on line {line_number}: <{footnote}>')
def usage():
    """Print the help text of the script, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to process the footnotes of a page.

    svgscripts/process_footnotes.py [OPTIONS] <xmlManuscriptFile|svg_pos_file>

        <xmlManuscriptFile>     a xml file about a manuscript, containing information about its pages.
        <svg_pos_file>          a xml file about a page, containing information about svg word positions.

        OPTIONS:
        -h|--help                           show help
        -s|--skip-until=left                skip all nodes.get('X') < left

        :return: exit code (int)
    """
    # The docstring above is printed by usage(), further remarks are kept in comments:
    # - returns 2 on wrong options or arguments, 0 otherwise,
    # - raises FileNotFoundError if the file passed as argument does not exist.
    skip_after = -1.0
    try:
        opts, args = getopt.getopt(argv, "hs:", ["help", "skip-until="])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-s', '--skip-until'):
            try:
                skip_after = float(arg)
            except ValueError:
                # Fix: a value that is not a number is a usage error like any
                # other wrong option, it used to end in an uncaught ValueError.
                usage()
                return 2
    if len(args) < 1:
        usage()
        return 2
    exit_status = 0
    file_a = args[0]
    if not isfile(file_a):
        raise FileNotFoundError('File {} does not exist!'.format(file_a))
    # NOTE(review): manuscript_file is not used below; the call is kept because
    # it is unclear from here whether anything depends on xml_has_type being run.
    manuscript_file = file_a\
            if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\
            else None
    counter = 0
    for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK):
        if not UNITTESTING:
            print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL)
            back_up(page, page.xml_file)
        categorize_footnotes(page, skip_after=skip_after, find_content=True)
        save_imprints(page)
        counter += 1
    if not UNITTESTING:
        print(Style.RESET_ALL + f'[{counter} pages processed]')
    return exit_status
# Run main with the command line arguments (without the script name) and
# use its return value as exit code of the process.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Event Timeline
Log In to Comment