Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F88592049
footnotes.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Oct 19, 15:33
Size
16 KB
Mime Type
text/x-python
Expires
Mon, Oct 21, 15:33 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
21794669
Attached To
rNIETZSCHEPYTHON nietzsche-python
footnotes.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract footnotes from a svg file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import
re
import
sys
from
os
import
listdir
,
sep
,
path
from
os.path
import
isfile
,
isdir
,
dirname
import
lxml.etree
as
ET
import
warnings
# Module metadata (authorship, licence and version of this script).
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from
.atypical_writing
import
AtypicalWriting
from
.clarification
import
Clarification
from
.editor_correction
import
EditorCorrection
from
.line_continuation
import
LineContinuation
from
.matrix
import
Matrix
from
.standoff_tag
import
StandoffTag
from
.text
import
Text
from
.transkriptionField
import
TranskriptionField
from
.uncertain_decipherment
import
UncertainDecipherment
# Module-level switches.
# UNITTESTING is not read by any code visible in this module.
UNITTESTING = False
# DEBUG enables an extra diagnostic print in
# FootnoteColumns._process_content_and_markup.
DEBUG = False
class FootnoteColumns:
    """Sort the footnotes of a page into columns by their left position.

    The nodes on the first footnote line (``bottom_values[0]``) define the
    columns: every time the text collected on that line contains a line
    reference such as ``12:``, ``3-4:`` or ``3/4:``, a new column is opened
    and its rounded x position is stored in ``footnote_keys``.  Footnotes are
    afterwards appended to the column selected by ``self.index``.
    """
    # Raw strings: '\d' and '\D' are invalid escape sequences in plain string
    # literals.  The resulting pattern text is unchanged.
    REFERENCE_PATTERN = re.compile(r'.*(\d+-)*[0-9]+:')
    EXTENDED_REFERENCE_PATTERN = re.compile(r'.*(\d+(-|/))*[0-9]+:')
    REFERENCE_GROUP = re.compile(r'(.*\D)((\d+-)*[0-9]+:)')
    EXCEPTION = re.compile(r'((\d+/)+[0-9]+:)')

    def __init__(self, nsmap, nodes, bottom_values, style_dict, debug=False, skip_after=-1.0):
        """Store the input data and derive the columns from the first line.

        :param nsmap: namespace map used for ``findall('tspan', ...)``.
        :param nodes: the text nodes of the footnote area.
        :param bottom_values: y values of the footnote lines (rounded to one
            decimal by :meth:`GET_UNIQUE_BOTTOM_VALUES`).
        :param style_dict: passed on to ``StandoffTag.create_cls``.
        :param debug: stored on the instance; not read inside this class.
        :param skip_after: only nodes whose x position is greater than this
            value are taken into account.
        :raises Exception: if no column can be found on the first line.
        """
        self.bottom_values = bottom_values
        # One list of footnotes per column.
        self.footnote_columns = []
        # Maps the rounded x position of a column to its index in
        # self.footnote_columns.
        self.footnote_keys = {}
        # Index of the column that append() currently writes to.
        self.index = 0
        self.nodes = nodes
        self.nsmap = nsmap
        self.skip_after = skip_after
        self.style_dict = style_dict
        self.debug = debug
        self._init_columns()

    def _nodes_on_line(self, bottom_value):
        """Return ``(matrix, node)`` pairs of the line at bottom_value.

        Only nodes right of ``self.skip_after`` are returned, sorted by their
        x position.  The transform matrix is parsed once per node and handed
        back so that callers do not have to parse it again.
        """
        pairs = []
        for node in self.nodes:
            matrix = Matrix(transform_matrix_string=node.get('transform'))
            if round(matrix.getY(), 1) == bottom_value and matrix.getX() > self.skip_after:
                pairs.append((matrix, node))
        # list.sort is stable, so nodes with equal x keep their input order.
        pairs.sort(key=lambda pair: pair[0].getX())
        return pairs

    def _init_columns(self):
        """Initialize footnote column positions.

        Creates one empty list in self.footnote_columns per column and adds
        the rounded x position of the column as key to self.footnote_keys,
        with the index of the list in self.footnote_columns as value.

        :raises Exception: if the first line does not yield any column.
        """
        current_nodes = []
        for matrix, node in self._nodes_on_line(round(self.bottom_values[0], 1)):
            if len(node) > 0:
                # The node has children: take position and text of its tspans.
                for tspan in node.findall('tspan', self.nsmap):
                    x = matrix.add2X(float(tspan.get('x')))
                    current_nodes.append({'x': x, 'text': tspan.text})
            elif node.text is not None:
                current_nodes.append({'x': matrix.getX(), 'text': node.text})
            collected_text = ''.join([item.get('text') for item in current_nodes])
            if re.match(self.EXTENDED_REFERENCE_PATTERN, collected_text):
                # The text collected so far contains a line reference: it
                # marks the start of a new column.
                current_nodes = self._remove_unused_texts(current_nodes)
                self.footnote_columns.append([])
                column_key = round(current_nodes[0].get('x'))
                self.footnote_keys[column_key] = len(self.footnote_columns) - 1
                current_nodes = []
        if len(self.footnote_keys) == 0:
            raise Exception('ERROR: there are no footnote_keys')

    def _remove_unused_texts(self, nodes):
        """Remove entries that contain text that is not a line reference.

        :param nodes: list of dicts with the keys ``'x'`` and ``'text'``.
        :return: ``nodes`` without the leading entries whose text precedes
            the line reference.  If two consecutive remaining entries are
            more than 100 units apart, everything up to and including the
            entry right after that gap is dropped as well.  ``nodes`` is
            returned unchanged if its text has no leading non-reference text
            or starts with a reference of the form ``1/2:``.
        """
        threshold = 100
        node_text = ''.join([item.get('text') for item in nodes])
        match = re.match(self.REFERENCE_GROUP, node_text)
        if match is None or match.group(1) is None\
                or re.match(self.EXCEPTION, node_text):
            return nodes
        leading_text = match.group(1)
        # Count the entries that lie completely inside the leading text.
        index = 0
        consumed_text = ''
        for item in nodes:
            consumed_text += item.get('text')
            if not leading_text.startswith(consumed_text):
                break
            index += 1
        # NOTE(review): after a gap the entry right behind the gap is dropped
        # too; this reproduces the original behaviour -- confirm it is wanted.
        for position in range(index, len(nodes) - 1):
            if nodes[position + 1].get('x') - nodes[position].get('x') > threshold:
                return nodes[position + 2:]
        return nodes[index:]

    def append(self, footnote):
        """Append footnote to the column selected by self.index.
        """
        self.footnote_columns[self.index].append(footnote)

    @classmethod
    def create_cls(cls, style_dict=None, page=None, transkription_field=None, svg_tree=None, svg_file=None, marginals_on_extra_page=False, skip_after=-1.0):
        """Create a FootnoteColumns instance from a page and/or a svg file.

        Missing arguments are derived from ``page``: the svg file is
        ``page.marginals_source`` if that is set, else ``page.source``.  If
        ``page.marginals_source`` is set, the tree is always parsed from it
        and ``marginals_on_extra_page`` is forced to True, whatever was
        passed in.

        :return: a FootnoteColumns instance, or None if the footnote area
            does not contain any node.
        """
        if page is not None and page.source is not None and svg_file is None:
            svg_file = page.source\
                    if page.marginals_source is None\
                    else page.marginals_source
        if transkription_field is None and svg_file is not None:
            multipage_index = -1\
                    if page is None\
                    else page.multipage_index
            transkription_field = TranskriptionField(svg_file, multipage_index=multipage_index)
        if page is not None and page.marginals_source is not None:
            # The marginals file takes precedence over any other tree, so it
            # is parsed exactly once here instead of parsing svg_file first
            # and throwing that tree away.
            marginals_on_extra_page = True
            svg_tree = ET.parse(page.marginals_source)
        elif svg_tree is None and svg_file is not None:
            svg_tree = ET.parse(svg_file)
        if style_dict is None and page is not None:
            style_dict = StandoffTag.create_relevant_style_dictionary(page)
        nodes_in_footnote_area = cls.EXTRACT_NODES_IN_FOOTNOTE_AREA(svg_tree, transkription_field, marginals_on_extra_page=marginals_on_extra_page)
        bottom_values = cls.GET_UNIQUE_BOTTOM_VALUES(nodes_in_footnote_area)
        if len(bottom_values) == 0:
            return None
        return cls(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values, style_dict, skip_after=skip_after)

    def extract_footnotes(self, contains_string='', contains_strings=None) -> list:
        """Returns all footnotes as a list of Text.

        Every line is processed from left to right; the collected footnotes
        are appended to the columns of this instance before the list is
        built with :meth:`toList`.

        :param contains_string: if not empty, keep only footnotes whose
            content contains this string.
        :param contains_strings: if not None, keep only footnotes whose
            content contains at least one of these strings.  Both filters
            are applied if both are given.
        """
        for bottom_value in self.bottom_values:
            footnote = None
            for matrix, node in self._nodes_on_line(bottom_value):
                footnote, _ = self._process_content_and_markup(node, footnote, matrix)
            if footnote is not None:
                # Store the footnote that is still open at the end of a line.
                self.append(footnote)
        footnotes = self.toList()
        if contains_strings is not None:
            footnotes = [footnote for footnote in footnotes
                         if any(searched in footnote.content for searched in contains_strings)]
        if contains_string != '':
            footnotes = [footnote for footnote in footnotes if contains_string in footnote.content]
        return footnotes

    def get_index(self, left_value) -> int:
        """Return index of column for left value.

        The rounded left value is looked up in self.footnote_keys; if it is
        not a key, the first key that differs by less than 2 is used.

        :return: the column index, or -1 if no column matches.
        """
        rounded_value = round(left_value)
        if rounded_value in self.footnote_keys:
            return self.footnote_keys[rounded_value]
        for key, value in self.footnote_keys.items():
            if abs(key - rounded_value) < 2:
                return value
        return -1

    def register_index(self, left_value):
        """Register index for next column to be used.

        :raises Exception: if left_value does not belong to any column.
        """
        index = self.get_index(left_value)
        if index > -1:
            self.index = index
        else:
            error_value = round(left_value)
            msg = f'Left value not part of columns: {error_value} -> {self.footnote_keys}'
            raise Exception(msg)

    def toList(self):
        """Return footnotes as a list of Text.

        The columns are read one after the other.  A footnote whose content
        matches REFERENCE_PATTERN starts a new entry; any other footnote is
        joined to the entry before it.

        :raises Exception: if the very first footnote does not match
            REFERENCE_PATTERN, so that there is nothing to join it to.
        """
        footnotes = []
        for footnote_list in self.footnote_columns:
            for footnote in footnote_list:
                if re.match(self.REFERENCE_PATTERN, footnote.content):
                    footnotes.append(footnote)
                elif len(footnotes) > 0:
                    footnotes[-1].join(footnote)
                else:
                    # Diagnostic output of all columns.  The original printed
                    # self.footnote_columns[1] only, which raised an
                    # IndexError for a single column and hid the error below.
                    print([[item.content for item in column] for column in self.footnote_columns])
                    print(self.footnote_keys)
                    raise Exception(f'List of footnotes empty and footnote "{footnote.content}" does not match {self.REFERENCE_PATTERN.pattern}!')
        return footnotes

    def _process_content_and_markup(self, node, footnote, matrix):
        """Process content and markup of node.

        The text of node (or of its tspan children) is added to the current
        footnote.  The current footnote is closed and stored first if the
        new text starts a new footnote or belongs to another column.

        :param node: a text node or one of its tspan children.
        :param footnote: the footnote that is currently open, or None.
        :param matrix: the transform matrix of the enclosing text node.
        [:return:] (footnote: Text, left_value: float)
        """
        startIndex = 0
        next_text = node.text
        left_value = matrix.getX()
        items = node.findall('tspan', self.nsmap)
        if len(items) > 0:
            next_text = ''.join([item.text for item in items])
            left_value = matrix.add2X(float(items[0].get('x')))
        elif bool(node.get('x')):
            left_value = matrix.add2X(float(node.get('x')))
        if footnote is not None:
            # A new footnote starts if both texts contain a line reference
            # and the open footnote does not contain a digit followed by '-'.
            close_footnote = bool(re.match(r'.*[0-9]+:', next_text)
                                  and re.match(r'.*[0-9]+:', footnote.content)
                                  and not re.match(r'.*\d-', footnote.content))
            if not close_footnote:
                # ... or if the text is positioned in another known column.
                column_index = self.get_index(left_value)
                close_footnote = column_index > -1 and column_index != self.index
            if close_footnote:
                if DEBUG and re.match(r'.*[0-9]+:', next_text)\
                        and not re.match(r'.*[0-9]+:', footnote.content):
                    print(footnote, next_text)
                self.append(footnote)
                footnote = None
        if len(items) > 0:
            # Content and markup are taken from the tspan children.
            for item in items:
                footnote, left_value = self._process_content_and_markup(item, footnote, matrix)
        else:
            if footnote is None:
                footnote = Text(content=next_text)
                try:
                    self.register_index(left_value)
                except Exception as error:
                    print(self.footnote_columns)
                    raise Exception(f'{footnote}') from error
            else:
                startIndex = footnote.append(next_text)
            if bool(node.get('class')):
                standoff_markups = StandoffTag.create_cls(startIndex, len(footnote.content)-1, node.get('class'), style_dict=self.style_dict)
                if len(standoff_markups) > 0:
                    if len(footnote.standoff_markups) > 0:
                        standoff_markups = footnote.standoff_markups[-1].join_list(standoff_markups)
                    if len(standoff_markups) > 0:
                        footnote.standoff_markups += standoff_markups
        return footnote, left_value

    @staticmethod
    def EXTRACT_NODES_IN_FOOTNOTE_AREA(svg_tree, transkription_field=None, marginals_on_extra_page=False) -> list:
        """Return a list of nodes that are in footnote area.

        For a returned node that is itself positioned outside of the footnote
        area, those children that are outside of the area are removed from
        the node, i.e. ``svg_tree`` is modified in place.
        """
        if transkription_field is None and svg_tree is not None:
            transkription_field = TranskriptionField(svg_tree.docinfo.URL)
        root = svg_tree.getroot()
        nodes_in_footnote_area = [
            node for node in root.iterfind('.//text', root.nsmap)
            if Matrix.NODE_HAS_CONTENT_IN_FOOTNOTE_AREA(node, transkription_field,
                                                        marginals_on_extra_page=marginals_on_extra_page)
        ]
        for node in nodes_in_footnote_area:
            if not Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, marginals_on_extra_page=marginals_on_extra_page):
                # list(node) is a copy of the children (the replacement for
                # the deprecated getchildren()), so removing is safe.
                for child in list(node):
                    if not Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, x=float(child.get('x')), marginals_on_extra_page=marginals_on_extra_page):
                        node.remove(child)
        return nodes_in_footnote_area

    @staticmethod
    def GET_UNIQUE_BOTTOM_VALUES(nodes_in_footnote_area) -> list:
        """Return sorted list of unique bottom values.

        The bottom value of a node is the y value of its transform matrix,
        rounded to one decimal.
        """
        return sorted({
            round(Matrix(transform_matrix_string=item.get('transform')).getY(), 1)
            for item in nodes_in_footnote_area
        })
def extract_footnotes_as_strings(transkription_field=None, svg_tree=None, svg_file=None, contains_string='', marginals_extra=False):
    """Returns all footnotes as a list of strings.

    The text nodes of the footnote area are read line by line (ascending y
    value of their transform matrix) and from left to right.  On a line, a
    text that contains a line reference (digits followed by ``:``) starts a
    new footnote string; any other text is added to the current one.  The
    string that is open at the end of a line is stored as well.

    :param transkription_field: created from svg_file if None.
    :param svg_tree: parsed from svg_file if None.
    :param svg_file: path of the svg file; needed if one of the two
        arguments above is missing.
    :param contains_string: if not empty, return only those strings that
        contain it.
    :param marginals_extra: passed to ``Matrix.IS_IN_FOOTNOTE_AREA`` as
        ``marginals_on_extra_page``.
    """
    if transkription_field is None and svg_file is not None:
        transkription_field = TranskriptionField(svg_file)
    if svg_tree is None and svg_file is not None:
        svg_tree = ET.parse(svg_file)
    root = svg_tree.getroot()
    nsmap = root.nsmap
    # Group the nodes by their y value in a single pass, so that every
    # transform matrix is parsed only once.
    nodes_by_bottom_value = {}
    for node in root.iterfind('.//text', nsmap):
        if Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, marginals_on_extra_page=marginals_extra):
            matrix = Matrix(transform_matrix_string=node.get('transform'))
            nodes_by_bottom_value.setdefault(matrix.getY(), []).append((matrix.getX(), node))
    footnotes = []
    for bottom_value in sorted(nodes_by_bottom_value):
        footnote_string = ''
        # Sort by x only: the sort is stable, so nodes with the same x keep
        # their document order.
        for _, node in sorted(nodes_by_bottom_value[bottom_value], key=lambda entry: entry[0]):
            if len(node) == 0:
                # .text is None for an element without text.
                next_string = node.text or ''
            else:
                next_string = ''.join(tspan.text or '' for tspan in node.findall('tspan', nsmap))
            if footnote_string != '' and re.match(r'.*[0-9]+:', next_string):
                footnotes.append(footnote_string)
                footnote_string = next_string
            else:
                footnote_string += next_string
        footnotes.append(footnote_string)
    if contains_string != '':
        footnotes = [footnote_string for footnote_string in footnotes if contains_string in footnote_string]
    return footnotes
def extract_footnotes(page, transkription_field=None, svg_tree=None, svg_file=None, contains_string='', contains_strings=None, skip_after=-1.0) -> list:
    """Returns all footnotes as a list of Text.

    :param page: the page; its ``source`` and ``marginals_source`` are read.
    :param transkription_field: created from ``page.source`` if None.
    :param svg_tree: passed to ``FootnoteColumns.create_cls``, which ignores
        it and parses ``page.marginals_source`` if that is set.
    :param svg_file: passed to ``FootnoteColumns.create_cls``.
    :param contains_string: if not empty, return only footnotes whose
        content contains this string.
    :param contains_strings: if not None, return only footnotes whose
        content contains at least one of these strings.
    :param skip_after: passed to ``FootnoteColumns.create_cls``.
    :return: the footnotes, or an empty list if the footnote area of the
        page does not contain any node.
    """
    # NOTE(review): create_cls builds its TranskriptionField with
    # page.multipage_index, this function does not -- confirm this is wanted.
    if transkription_field is None:
        transkription_field = TranskriptionField(page.source)
    # FootnoteColumns.create_cls parses page.marginals_source itself whenever
    # it is set and replaces any tree passed in, so the file is not parsed
    # here a second time.
    marginals_on_extra_page = page.marginals_source is not None
    footnote_columns = FootnoteColumns.create_cls(page=page, transkription_field=transkription_field,
                                                  svg_tree=svg_tree, svg_file=svg_file,
                                                  marginals_on_extra_page=marginals_on_extra_page,
                                                  skip_after=skip_after)
    if footnote_columns is None:
        return []
    return footnote_columns.extract_footnotes(contains_string=contains_string, contains_strings=contains_strings)
# Script entry point.
# NOTE(review): no `main` function is defined in the visible part of this
# module, so running it as a script would raise a NameError here -- confirm
# whether `main` was removed or is expected to come from elsewhere.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Event Timeline
Log In to Comment