Page MenuHomec4science

footnotes.py
No OneTemporary

File Metadata

Created
Tue, May 14, 12:18

footnotes.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract footnotes from a svg file.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import getopt
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from .matrix import Matrix
from .transkriptionField import TranskriptionField
UNITTESTING = False
def extract_footnotes_as_strings(transkription_field=None, svg_tree=None, svg_file=None, contains_string=''):
"""Returns all footnotes as a list of strings.
"""
if transkription_field is None and svg_file is not None:
transkription_field = TranskriptionField(svg_file)
if svg_tree is None and svg_file is not None:
svg_tree = ET.parse(svg_file)
footnotes = []
nodes_in_footnote_area = [ item for item in filter(lambda x: Matrix.IS_IN_FOOTNOTE_AREA(x.get('transform'), transkription_field),\
svg_tree.getroot().iterfind('.//text', svg_tree.getroot().nsmap))]
bottom_values = sorted([ bottom_value for bottom_value in set(Matrix(transform_matrix_string=item.get('transform')).getY() for item in nodes_in_footnote_area) ])
for bottom_value in bottom_values:
nodes_on_line = [ item for item in nodes_in_footnote_area if Matrix(transform_matrix_string=item.get('transform')).getY() == bottom_value ]
nodes_on_line = sorted(nodes_on_line, key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX())
footnote_string = ''
for node in nodes_on_line:
if len(node.getchildren()) == 0:
if footnote_string != '' and re.match(r'.*[0-9]+:', node.text):
footnotes.append(footnote_string)
footnote_string = node.text
else:
footnote_string += node.text
else:
next_string = ''.join([ item.text for item in node.findall('tspan', svg_tree.getroot().nsmap)])
if footnote_string != '' and re.match(r'.*[0-9]+:', next_string):
footnotes.append(footnote_string)
footnote_string = next_string
else:
footnote_string += next_string
footnotes.append(footnote_string)
if contains_string != '':
footnotes = [ footnote_string for footnote_string in footnotes if contains_string in footnote_string ]
return footnotes
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))

Event Timeline