Page MenuHomec4science

footnotes.py
No OneTemporary

File Metadata

Created
Sat, Oct 19, 15:33

footnotes.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract footnotes from a svg file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
import warnings
# Module metadata (authorship, licence and version of this file).
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from .atypical_writing import AtypicalWriting
from .clarification import Clarification
from .editor_correction import EditorCorrection
from .line_continuation import LineContinuation
from .matrix import Matrix
from .standoff_tag import StandoffTag
from .text import Text
from .transkriptionField import TranskriptionField
from .uncertain_decipherment import UncertainDecipherment
# Not read anywhere in the visible part of this file.
UNITTESTING = False
# When True, FootnoteColumns._process_content_and_markup prints the current
# footnote and the next text if the next text looks like a line reference
# while the current footnote does not.
DEBUG = False
class FootnoteColumns:
    """This class represents footnote columns.

    The column positions are derived from the footnote line at
    ``bottom_values[0]``: every text run on that line that looks like a line
    reference (e.g. ``12:`` or ``3-4:``) opens a new column, keyed by its
    rounded x position.  Afterwards :meth:`extract_footnotes` walks all lines,
    assigns each piece of text to a column and finally flattens the columns
    into one list of footnotes.
    """
    # Raw strings are required here: '\d' and '\D' inside a plain string
    # literal are invalid escape sequences.
    REFERENCE_PATTERN = re.compile(r'.*(\d+-)*[0-9]+:')
    EXTENDED_REFERENCE_PATTERN = re.compile(r'.*(\d+(-|/))*[0-9]+:')
    REFERENCE_GROUP = re.compile(r'(.*\D)((\d+-)*[0-9]+:)')
    EXCEPTION = re.compile(r'((\d+/)+[0-9]+:)')
    # Patterns used while deciding whether a text starts a new footnote.
    _LINE_REFERENCE = re.compile(r'.*[0-9]+:')
    _OPEN_RANGE = re.compile(r'.*\d-')

    def __init__(self, nsmap, nodes, bottom_values, style_dict, debug=False, skip_after=-1.0):
        """Store the inputs and determine the column positions.

        Args:
            nsmap: namespace map handed to ``findall`` when looking up tspans.
            nodes: text nodes of the footnote area.
            bottom_values: y values of the footnote lines; the first entry
                is the line that is used to detect the columns.
            style_dict: passed on to ``StandoffTag.create_cls``.
            debug: stored as ``self.debug`` (not read inside this class).
            skip_after: only nodes whose x is greater than this value are used.

        Raises:
            Exception: if no column could be determined.
        """
        self.bottom_values = bottom_values
        # One list of footnotes per column.
        self.footnote_columns = []
        # Maps rounded x position of a column -> index in footnote_columns.
        self.footnote_keys = {}
        # Index of the column that `append` currently writes to.
        self.index = 0
        self.nodes = nodes
        self.nsmap = nsmap
        self.skip_after = skip_after
        self.style_dict = style_dict
        self.debug = debug
        self._init_columns()

    def _nodes_on_line(self, bottom_value):
        """Return the nodes of one footnote line, sorted from left to right.

        A node belongs to the line if the y value of its transform matrix,
        rounded to one decimal, equals ``bottom_value`` rounded to one decimal
        and if its x value is greater than ``self.skip_after``.  The transform
        of every node is parsed once instead of once per comparison.
        """
        line_y = round(bottom_value, 1)
        positioned = []
        for node in self.nodes:
            matrix = Matrix(transform_matrix_string=node.get('transform'))
            if round(matrix.getY(), 1) == line_y and matrix.getX() > self.skip_after:
                positioned.append((matrix.getX(), node))
        # list.sort is stable, so nodes with equal x keep their input order.
        positioned.sort(key=lambda pair: pair[0])
        return [node for _, node in positioned]

    def _init_columns(self):
        """Initialize footnote column positions
        by creating lists in self.footnote_columns and adding the positions as keys
        to self.footnote_keys while the indices of self.footnote_columns are their values.

        Raises:
            Exception: if no line reference was found on the first line.
        """
        current_nodes = []
        for node in self._nodes_on_line(self.bottom_values[0]):
            matrix = Matrix(transform_matrix_string=node.get('transform'))
            if len(node) > 0:
                for tspan in node.findall('tspan', self.nsmap):
                    x = matrix.add2X(float(tspan.get('x')))
                    # A tspan without text must not break the joins below.
                    current_nodes.append({'x': x, 'text': tspan.text or ''})
            elif node.text is not None:
                current_nodes.append({'x': matrix.getX(), 'text': node.text})
            # Text is accumulated over nodes until it contains a reference.
            collected_text = ''.join(item.get('text') for item in current_nodes)
            if re.match(self.EXTENDED_REFERENCE_PATTERN, collected_text):
                current_nodes = self._remove_unused_texts(current_nodes)
                self.footnote_columns.append([])
                self.footnote_keys.update({round(current_nodes[0].get('x')): len(self.footnote_columns) - 1})
                current_nodes = []
        if len(self.footnote_keys) == 0:
            raise Exception('ERROR: there are no footnote_keys')

    def _remove_unused_texts(self, nodes):
        """Remove leading items that contain text that is not a line reference.

        Args:
            nodes: list of dicts with the keys ``'x'`` and ``'text'``.

        Returns:
            ``nodes`` unchanged if the joined text has no non-reference prefix
            or starts with a slash reference (``EXCEPTION``); otherwise the
            tail of ``nodes`` behind the prefix.
        """
        threshold = 100
        node_text = ''.join(item.get('text') or '' for item in nodes)
        match = re.match(self.REFERENCE_GROUP, node_text)
        if match is None or match.group(1) is None\
                or re.match(self.EXCEPTION, node_text):
            return nodes
        prefix = match.group(1)
        # Count the leading items that lie completely inside the prefix.
        index = 0
        unused_text = ''
        for item in nodes:
            unused_text += item.get('text') or ''
            if not prefix.startswith(unused_text):
                break
            index += 1
        # Look for a horizontal gap wider than the threshold between two
        # neighbouring items behind the prefix.
        for position in range(index, len(nodes) - 1):
            if nodes[position + 1].get('x') - nodes[position].get('x') > threshold:
                # NOTE(review): as in the original code, the first item
                # behind the gap is dropped as well -- confirm this is intended.
                return nodes[position + 2:]
        return nodes[index:]

    def append(self, footnote):
        """Append footnote to the column selected by ``self.index``.
        """
        self.footnote_columns[self.index].append(footnote)

    @classmethod
    def create_cls(cls, style_dict=None, page=None, transkription_field=None, svg_tree=None, svg_file=None, marginals_on_extra_page=False, skip_after=-1.0):
        """Create a FootnoteColumns instance from a page and/or a svg file.

        Missing arguments are derived from the others: the svg file from
        ``page``, the transkription field and the svg tree from the svg file,
        the style dictionary from ``page``.  If ``page.marginals_source`` is
        set, the svg tree is always parsed from that file.

        Returns:
            A FootnoteColumns instance, or None if the footnote area
            contains no nodes.
        """
        if page is not None and page.source is not None and svg_file is None:
            svg_file = page.source\
                    if page.marginals_source is None\
                    else page.marginals_source
        if transkription_field is None and svg_file is not None:
            multipage_index = -1\
                    if page is None\
                    else page.multipage_index
            transkription_field = TranskriptionField(svg_file, multipage_index=multipage_index)
        if svg_tree is None and svg_file is not None:
            svg_tree = ET.parse(svg_file)
        if style_dict is None and page is not None:
            style_dict = StandoffTag.create_relevant_style_dictionary(page)
        if page is not None and page.marginals_source is not None:
            marginals_on_extra_page = True
            svg_tree = ET.parse(page.marginals_source)
        nodes_in_footnote_area = cls.EXTRACT_NODES_IN_FOOTNOTE_AREA(svg_tree, transkription_field, marginals_on_extra_page=marginals_on_extra_page)
        bottom_values = cls.GET_UNIQUE_BOTTOM_VALUES(nodes_in_footnote_area)
        if len(bottom_values) == 0:
            return None
        return cls(svg_tree.getroot().nsmap, nodes_in_footnote_area, bottom_values, style_dict, skip_after=skip_after)

    def extract_footnotes(self, contains_string='', contains_strings=None) -> list:
        """Returns all footnotes as a list of Text.

        Args:
            contains_string: if not empty, keep only footnotes whose content
                contains this string.
            contains_strings: if not None, keep only footnotes whose content
                contains at least one of these strings.

        Note:
            The footnotes found are appended to the stored columns, i.e. this
            method changes the state of the instance.
        """
        for bottom_value in self.bottom_values:
            footnote = None
            for node in self._nodes_on_line(bottom_value):
                matrix = Matrix(transform_matrix_string=node.get('transform'))
                footnote, _ = self._process_content_and_markup(node, footnote, matrix)
            # The footnote that is still open at the end of a line is stored.
            if footnote is not None:
                self.append(footnote)
        footnotes = self.toList()
        if contains_strings is not None:
            footnotes = [footnote for footnote in footnotes
                         if any(part in footnote.content for part in contains_strings)]
        if contains_string != '':
            footnotes = [footnote for footnote in footnotes if contains_string in footnote.content]
        return footnotes

    def get_index(self, left_value) -> int:
        """Return index of column for left value.

        The rounded left value has to be a column key or to differ by less
        than 2 from a column key; otherwise -1 is returned.
        """
        index = -1
        if round(left_value) in self.footnote_keys:
            index = self.footnote_keys[round(left_value)]
        else:
            for key, value in self.footnote_keys.items():
                if abs(key - round(left_value)) < 2:
                    index = value
                    break
        return index

    def register_index(self, left_value):
        """Register index for next column to be used.

        Raises:
            Exception: if left_value does not belong to any column.
        """
        index = self.get_index(left_value)
        if index > -1:
            self.index = index
        else:
            error_value = round(left_value)
            msg = f'Left value not part of columns: {error_value} -> {self.footnote_keys}'
            raise Exception(msg)

    def toList(self):
        """Return footnotes as a list of Text.

        The columns are read one after the other.  An entry whose content
        starts a line reference becomes a new footnote, any other entry is
        joined to the footnote before it.

        Raises:
            Exception: if the very first entry does not contain a line reference.
        """
        footnotes = []
        for footnote_list in self.footnote_columns:
            for footnote in footnote_list:
                if re.match(self.REFERENCE_PATTERN, footnote.content):
                    footnotes.append(footnote)
                elif len(footnotes) > 0:
                    footnotes[-1].join(footnote)
                else:
                    # Print all columns: a fixed column index would raise an
                    # IndexError here if there is only one column and hide
                    # the exception below.
                    print([[item.content for item in column] for column in self.footnote_columns])
                    print(self.footnote_keys)
                    raise Exception(f'List of footnotes empty and footnote "{footnote.content}" does not match {self.REFERENCE_PATTERN.pattern}!')
        return footnotes

    def _process_content_and_markup(self, node, footnote, matrix):
        """Process content and markup of node.

        A node with tspan children is processed by processing each tspan
        recursively with the same matrix.  A node without tspans either
        starts a new footnote or is appended to the current one.

        Args:
            node: text or tspan node.
            footnote: the footnote that is currently being built, or None.
            matrix: transform matrix of the enclosing text node.

        [:return:] (footnote: Text, left_value: float)
        """
        startIndex = 0
        # Nodes and tspans without text are treated as empty strings.
        next_text = node.text or ''
        left_value = matrix.getX()
        items = list(node.findall('tspan', self.nsmap))
        if len(items) > 0:
            next_text = ''.join(item.text or '' for item in items)
            left_value = matrix.add2X(float(items[0].get('x')))
        elif bool(node.get('x')):
            left_value = matrix.add2X(float(node.get('x')))
        # The current footnote is closed if the next text carries a line
        # reference while the footnote already has a complete one (i.e. it
        # does not contain an open range like "3-"), or if the next text
        # belongs to a different column.
        if footnote is not None and\
                ((self._LINE_REFERENCE.match(next_text)
                  and self._LINE_REFERENCE.match(footnote.content)
                  and not self._OPEN_RANGE.match(footnote.content))
                 or (self.get_index(left_value) > -1
                     and self.get_index(left_value) != self.index)):
            if DEBUG and self._LINE_REFERENCE.match(next_text)\
                    and not self._LINE_REFERENCE.match(footnote.content):
                print(footnote, next_text)
            self.append(footnote)
            footnote = None
        if len(items) > 0:
            for item in items:
                footnote, left_value = self._process_content_and_markup(item, footnote, matrix)
        else:
            if footnote is None:
                footnote = Text(content=next_text)
                try:
                    self.register_index(left_value)
                except Exception as error:
                    print(self.footnote_columns)
                    # Keep the original cause in the traceback.
                    raise Exception(f'{footnote}') from error
            else:
                # The return value of append is used as start of the markup.
                startIndex = footnote.append(next_text)
            if bool(node.get('class')):
                standoff_markups = StandoffTag.create_cls(startIndex, len(footnote.content)-1, node.get('class'), style_dict=self.style_dict)
                if len(standoff_markups) > 0:
                    if len(footnote.standoff_markups) > 0:
                        standoff_markups = footnote.standoff_markups[-1].join_list(standoff_markups)
                    if len(standoff_markups) > 0:
                        footnote.standoff_markups += standoff_markups
        return footnote, left_value

    @staticmethod
    def EXTRACT_NODES_IN_FOOTNOTE_AREA(svg_tree, transkription_field=None, marginals_on_extra_page=False) ->list:
        """Return a list of nodes that are in footnote area.

        Note:
            For a node whose own transform is not inside the footnote area,
            the children that are not inside the footnote area are removed
            from the node, i.e. ``svg_tree`` is modified.
        """
        if transkription_field is None and svg_tree is not None:
            transkription_field = TranskriptionField(svg_tree.docinfo.URL)
        root = svg_tree.getroot()
        nodes_in_footnote_area = [
            node for node in root.iterfind('.//text', root.nsmap)
            if Matrix.NODE_HAS_CONTENT_IN_FOOTNOTE_AREA(node, transkription_field,
                                                        marginals_on_extra_page=marginals_on_extra_page)
        ]
        for node in nodes_in_footnote_area:
            if not Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, marginals_on_extra_page=marginals_on_extra_page):
                # list(node) is a copy, so removing children while looping is safe.
                for child in list(node):
                    if not Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, x=float(child.get('x')), marginals_on_extra_page=marginals_on_extra_page):
                        node.remove(child)
        return nodes_in_footnote_area

    @staticmethod
    def GET_UNIQUE_BOTTOM_VALUES(nodes_in_footnote_area) ->list:
        """Return sorted list of unique bottom values.

        A bottom value is the y value of a node's transform matrix, rounded
        to one decimal.
        """
        return sorted({round(Matrix(transform_matrix_string=item.get('transform')).getY(), 1)
                       for item in nodes_in_footnote_area})
def extract_footnotes_as_strings(transkription_field=None, svg_tree=None, svg_file=None, contains_string='', marginals_extra=False):
    """Returns all footnotes as a list of strings.

    The text nodes of the footnote area are grouped by the y value of their
    transform matrix and read line by line from left to right.  A text that
    contains a line reference (digits followed by a colon) starts a new
    string as long as the current string is not empty; any other text is
    added to the current string.  The string that is open at the end of a
    line is stored as well.

    Args:
        transkription_field: created from ``svg_file`` if None.
        svg_tree: parsed from ``svg_file`` if None.
        svg_file: path of the svg file; only used for the two defaults above.
        contains_string: if not empty, keep only strings that contain it.
        marginals_extra: passed on to ``Matrix.IS_IN_FOOTNOTE_AREA`` as
            ``marginals_on_extra_page``.
    """
    if transkription_field is None and svg_file is not None:
        transkription_field = TranskriptionField(svg_file)
    if svg_tree is None and svg_file is not None:
        svg_tree = ET.parse(svg_file)
    root = svg_tree.getroot()
    line_reference = re.compile(r'.*[0-9]+:')
    nodes_in_footnote_area = [
        node for node in root.iterfind('.//text', root.nsmap)
        if Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field, marginals_on_extra_page=marginals_extra)
    ]
    bottom_values = sorted({Matrix(transform_matrix_string=item.get('transform')).getY()
                            for item in nodes_in_footnote_area})
    footnotes = []
    for bottom_value in bottom_values:
        nodes_on_line = sorted(
            (item for item in nodes_in_footnote_area
             if Matrix(transform_matrix_string=item.get('transform')).getY() == bottom_value),
            key=lambda item: Matrix(transform_matrix_string=item.get('transform')).getX())
        footnote_string = ''
        for node in nodes_on_line:
            # Nodes and tspans without text count as empty strings, they
            # must not break the regex match or the concatenation.
            if len(node) == 0:
                next_string = node.text or ''
            else:
                next_string = ''.join(item.text or '' for item in node.findall('tspan', root.nsmap))
            if footnote_string != '' and line_reference.match(next_string):
                footnotes.append(footnote_string)
                footnote_string = next_string
            else:
                footnote_string += next_string
        footnotes.append(footnote_string)
    if contains_string != '':
        footnotes = [footnote_string for footnote_string in footnotes if contains_string in footnote_string]
    return footnotes
def extract_footnotes(page, transkription_field=None, svg_tree=None, svg_file=None, contains_string='', contains_strings=None, skip_after=-1.0) ->list:
    """Returns all footnotes of page as a list of Text.

    If the page has a separate marginals source, the svg tree is parsed from
    that file.  A missing transkription field is created from
    ``page.source``.  The work itself is done by ``FootnoteColumns``; an
    empty list is returned if no footnote columns could be created.
    """
    has_marginals_page = page.marginals_source is not None
    if has_marginals_page:
        svg_tree = ET.parse(page.marginals_source)
    if transkription_field is None:
        transkription_field = TranskriptionField(page.source)
    columns = FootnoteColumns.create_cls(
        page=page,
        transkription_field=transkription_field,
        svg_tree=svg_tree,
        svg_file=svg_file,
        marginals_on_extra_page=has_marginals_page,
        skip_after=skip_after,
    )
    if columns is not None:
        return columns.extract_footnotes(contains_string=contains_string, contains_strings=contains_strings)
    return []
# NOTE(review): no function `main` is defined in the visible part of this
# file, so running the module as a script would end in a NameError -- confirm
# whether a command line entry point is missing or this guard can go.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

Event Timeline