# Index: svgscripts/datatypes/footnotes.py
# ===================================================================
# --- svgscripts/datatypes/footnotes.py (revision 82)
# +++ svgscripts/datatypes/footnotes.py (revision 83)
# @@ -1,130 +1,263 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to extract footnotes from a svg file.
"""
#    Copyright (C) University of Basel 2019  {{{1
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/> 1}}}

import getopt
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

from .matrix import Matrix
from .standoff_tag import StandoffTag
from .text import Text
from .transkriptionField import TranskriptionField

UNITTESTING = False


class FootnoteColumns:
    """This class represents footnote columns.

    Footnotes are collected per column; a column is identified by the rounded
    x position of a footnote reference (text matching ``<digits>:``) found on
    the first (topmost) line of the footnote area.
    """
    REFERENCE_PATTERN = re.compile('.*[0-9]+:')
    # Misspelled original name, kept so that existing callers still work.
    REFERNCE_PATTERN = REFERENCE_PATTERN

    def __init__(self, nsmap, nodes, bottom_values, style_dict):
        """Register one column for every footnote reference on the first line.

        nsmap: namespace map used to find the tspan children
        nodes: text nodes of the footnote area
        bottom_values: sorted y values of the footnote lines (must not be empty)
        style_dict: style dictionary that is passed on to StandoffTag.create_cls
        """
        self.bottom_values = bottom_values
        self.footnote_columns = []
        self.footnote_keys = {}
        self.index = 0
        self.nodes = nodes
        self.nsmap = nsmap
        self.style_dict = style_dict
        first_line_y = self.bottom_values[0]
        for node in nodes:
            matrix = Matrix(transform_matrix_string=node.get('transform'))
            if matrix.getY() != first_line_y:
                continue
            # len(node) is the number of children; the former comparison
            # `node.getchildren() == 0` compared a list with an int and was never true.
            if len(node) == 0:
                if self.REFERENCE_PATTERN.match(node.text or ''):
                    self._register_column(matrix.getX())
            else:
                items = node.findall('tspan', self.nsmap)
                if self.REFERENCE_PATTERN.match(_joined_tspan_text(items)):
                    self._register_column(matrix.add2X(float(items[0].get('x'))))

    def _register_column(self, left_value):
        """Open a new column and remember its rounded left value as key.
        """
        self.footnote_columns.append([])
        self.footnote_keys[round(left_value)] = len(self.footnote_columns) - 1

    def append(self, footnote):
        """Append footnote to the currently registered column.
        """
        self.footnote_columns[self.index].append(footnote)

    @classmethod
    def create_cls(cls, style_dict=None, page=None, transkription_field=None, svg_tree=None, svg_file=None):
        """Create a FootnoteColumns instance for the footnote area of a svg.

        [:return:] FootnoteColumns or None if there is no text node in the footnote area
        """
        if transkription_field is None and svg_file is not None:
            transkription_field = TranskriptionField(svg_file)
        if svg_tree is None and svg_file is not None:
            svg_tree = ET.parse(svg_file)
        if style_dict is None and page is not None:
            style_dict = StandoffTag.create_relevant_style_dictionary(page)
        root = svg_tree.getroot()
        nodes_in_footnote_area = [ item for item in root.iterfind('.//text', root.nsmap)\
                if Matrix.IS_IN_FOOTNOTE_AREA(item.get('transform'), transkription_field) ]
        bottom_values = sorted({ Matrix(transform_matrix_string=item.get('transform')).getY()\
                for item in nodes_in_footnote_area })
        if len(bottom_values) == 0:
            return None
        return cls(root.nsmap, nodes_in_footnote_area, bottom_values, style_dict)

    def extract_footnotes(self, contains_string=''):
        """Returns all footnotes as a list of Text.

        contains_string: if not empty, only footnotes containing this string are returned
        """
        for bottom_value in self.bottom_values:
            nodes_on_line = [ item for item in self.nodes\
                    if Matrix(transform_matrix_string=item.get('transform')).getY() == bottom_value ]
            nodes_on_line.sort(key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX())
            footnote = None
            for node in nodes_on_line:
                matrix = Matrix(transform_matrix_string=node.get('transform'))
                footnote, _ = self._process_content_and_markup(node, footnote, matrix)
            if footnote is not None:
                self.append(footnote)
        footnotes = self.toList()
        if contains_string != '':
            footnotes = [ footnote for footnote in footnotes if contains_string in footnote.content ]
        return footnotes

    def get_index(self, left_value) -> int:
        """Return index of column for left value, or -1 if no column matches.

        A column matches if its key equals the rounded left value or differs
        from it by less than 2.
        """
        rounded_value = round(left_value)
        if rounded_value in self.footnote_keys:
            return self.footnote_keys[rounded_value]
        for key, value in self.footnote_keys.items():
            if abs(key - rounded_value) < 2:
                return value
        return -1

    def register_index(self, left_value):
        """Register index for next column to be used.

        Raises an Exception if left_value does not belong to any column.
        """
        index = self.get_index(left_value)
        if index > -1:
            self.index = index
        else:
            error_value = round(left_value)
            msg = f'Left value not part of columns: {error_value} -> {self.footnote_keys}'
            raise Exception(msg)

    def toList(self):
        """Return footnotes as a list of Text.

        A text without footnote reference is a continuation and is joined to
        the preceding footnote.
        """
        footnotes = []
        for footnote_list in self.footnote_columns:
            for footnote in footnote_list:
                # The very first text may lack a reference: keep it as its own
                # entry instead of failing on footnotes[-1] of an empty list.
                if self.REFERENCE_PATTERN.match(footnote.content) or len(footnotes) == 0:
                    footnotes.append(footnote)
                else:
                    footnotes[-1].join(footnote)
        return footnotes

    def _process_content_and_markup(self, node, footnote, matrix):
        """Process content and markup of node.

        [:return:] (footnote: Text, left_value: float)
        """
        return _process_node(node, footnote, self, self.style_dict, self.nsmap, matrix)


def _joined_tspan_text(items):
    """Return the concatenated text of items, a missing text counts as ''.
    """
    return ''.join(item.text or '' for item in items)


def _add_standoff_markups(footnote, start_index, class_string, style_dict):
    """Create the standoff tags for the content appended at start_index and merge them into footnote.
    """
    standoff_markups = StandoffTag.create_cls(start_index, len(footnote.content), class_string, style_dict=style_dict)
    if len(standoff_markups) > 0 and len(footnote.standoff_markups) > 0:
        # extend the last tag where possible, keep only what could not be joined
        standoff_markups = footnote.standoff_markups[-1].join_list(standoff_markups)
    footnote.standoff_markups += standoff_markups


def _process_node(node, footnote, footnote_columns, style_dict, nsmap, matrix):
    """Shared implementation of the content and markup processing.

    A running footnote is closed (appended to its column) if the next text
    starts with a footnote reference or belongs to a different column.

    [:return:] (footnote: Text, left_value: float)
    """
    start_index = 0
    next_text = node.text or ''
    left_value = matrix.getX()
    items = node.findall('tspan', nsmap)
    if len(items) > 0:
        next_text = _joined_tspan_text(items)
        left_value = matrix.add2X(float(items[0].get('x')))
    if footnote is not None:
        column_index = footnote_columns.get_index(left_value)
        if FootnoteColumns.REFERENCE_PATTERN.match(next_text)\
           or (column_index > -1 and column_index != footnote_columns.index):
            footnote_columns.append(footnote)
            footnote = None
    if len(items) > 0:
        for item in items:
            footnote, left_value = _process_node(item, footnote, footnote_columns, style_dict, nsmap, matrix)
    else:
        if footnote is None:
            footnote = Text(content=next_text)
            footnote_columns.register_index(left_value)
        else:
            start_index = footnote.append(next_text)
        # NOTE(review): the indentation was lost in the diff; the markup handling
        # is assumed to belong to this leaf branch -- confirm against the repository.
        if bool(node.get('class')):
            _add_standoff_markups(footnote, start_index, node.get('class'), style_dict)
    return footnote, left_value


def extract_footnotes_as_strings(transkription_field=None, svg_tree=None, svg_file=None, contains_string=''):
    """Returns all footnotes as a list of strings.
    """
    if transkription_field is None and svg_file is not None:
        transkription_field = TranskriptionField(svg_file)
    if svg_tree is None and svg_file is not None:
        svg_tree = ET.parse(svg_file)
    root = svg_tree.getroot()
    footnotes = []
    nodes_in_footnote_area = [ item for item in root.iterfind('.//text', root.nsmap)\
            if Matrix.IS_IN_FOOTNOTE_AREA(item.get('transform'), transkription_field) ]
    bottom_values = sorted({ Matrix(transform_matrix_string=item.get('transform')).getY()\
            for item in nodes_in_footnote_area })
    for bottom_value in bottom_values:
        nodes_on_line = [ item for item in nodes_in_footnote_area\
                if Matrix(transform_matrix_string=item.get('transform')).getY() == bottom_value ]
        nodes_on_line.sort(key=lambda x: Matrix(transform_matrix_string=x.get('transform')).getX())
        footnote_string = ''
        for node in nodes_on_line:
            if len(node) == 0:
                next_string = node.text or ''
            else:
                next_string = _joined_tspan_text(node.findall('tspan', root.nsmap))
            if footnote_string != '' and re.match(r'.*[0-9]+:', next_string):
                # a new reference starts: close the running footnote
                footnotes.append(footnote_string)
                footnote_string = next_string
            else:
                footnote_string += next_string
        footnotes.append(footnote_string)
    if contains_string != '':
        footnotes = [ footnote_string for footnote_string in footnotes if contains_string in footnote_string ]
    return footnotes


def extract_footnotes(page, transkription_field=None, svg_tree=None, svg_file=None, contains_string=''):
    """Returns all footnotes as a list of Text.

    An empty list is returned if the svg has no text in its footnote area.
    """
    footnote_columns = FootnoteColumns.create_cls(page=page, transkription_field=transkription_field,\
            svg_tree=svg_tree, svg_file=svg_file)
    if footnote_columns is None:
        # create_cls returns None for a page without footnotes
        return []
    return footnote_columns.extract_footnotes(contains_string=contains_string)


def _process_content_and_markup(node, footnote, footnote_columns, style_dict, svg_tree, matrix):
    """Process content and markup of node.

    [:return:] (footnote: Text, left_value: float)
    """
    return _process_node(node, footnote, footnote_columns, style_dict, svg_tree.getroot().nsmap, matrix)


if __name__ == "__main__":
    # NOTE(review): no `main` is defined in the visible part of this module -- confirm.
    sys.exit(main(sys.argv[1:]))

# Index: svgscripts/datatypes/text.py
# ===================================================================
# --- svgscripts/datatypes/text.py (revision 82)
# +++ svgscripts/datatypes/text.py (revision 83)
# @@ -1,104 +1,114 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a text that may have standoff markup.
"""
#    Copyright (C) University of Basel 2020  {{{1
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/> 1}}}

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

from lxml import etree as ET
import re
import sys

from .attachable_object import AttachableObject
from .standoff_tag import StandoffTag

sys.path.append('py2ttl')
from class_spec import SemanticClass


class Text(AttachableObject,SemanticClass):
    """
    This class represents a text that may have standoff markup.
    """
    XML_TAG = 'text-with-markup'
    XML_SUB_TAG = 'text'

    def __init__(self, content: str, standoff_markups=None, id=0, tag=XML_TAG):
        self.id = str(id)
        self.tag = tag
        self.content = content
        self.standoff_markups = standoff_markups\
                if standoff_markups is not None\
                else []

    def append(self, content: str) -> int:
        """Extend text with content.

        [:return:] startIndex of appended content
        """
        startIndex = len(self.content)
        self.content += content
        return startIndex

    def attach_object_to_tree(self, target_tree):
        """Attach object to tree.

        An already existing node with the same tag and id is updated in place.
        """
        if target_tree.__class__.__name__ == '_ElementTree':
            target_tree = target_tree.getroot()
        existing_nodes = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)
        obj_node = existing_nodes[0]\
                if len(existing_nodes) > 0\
                else ET.SubElement(target_tree, self.tag)
        obj_node.set('id', self.id)
        # Reuse the content node of an existing object: adding a second one would
        # leave the stale content in first position, where create_cls_from_node reads it.
        text_node = obj_node.find(self.XML_SUB_TAG)
        if text_node is None:
            text_node = ET.SubElement(obj_node, self.XML_SUB_TAG)
        text_node.text = self.content
        for index, markup in enumerate(self.standoff_markups):
            markup.id = str(index)
            markup.attach_object_to_tree(obj_node)

    def join(self, other):
        """Join self and other.

        The content of other is appended after a blank. The standoff markups of
        other are shifted accordingly (they are modified in place) and added to
        the markups of self.
        """
        # +1 accounts for the blank that separates the two contents
        correction = self.append(' ' + other.content) + 1
        for standoff_markup in other.standoff_markups:
            standoff_markup.startIndex += correction
            standoff_markup.endIndex += correction
        self.standoff_markups += other.standoff_markups

    @classmethod
    def create_cls_from_node(cls, node):
        """Initialize a cls from node.

        [:return:] cls
        """
        standoff_markups = [ StandoffTag.create_cls_from_node(item) for item in\
                node.xpath('./' + '|./'.join(StandoffTag.MARKUP_STYLES)) ]
        text_contents = node.xpath('./' + cls.XML_SUB_TAG + '/text()')
        text = text_contents[0] if len(text_contents) > 0 else ''
        return cls(text, standoff_markups=standoff_markups, id=node.get('id'), tag=node.tag)

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.
        """
        properties = {}
        properties.update(cls.create_semantic_property_dictionary('content', str, cardinality=1,\
                name='textHasContent', label='content of text', comment='Connects a text with its content.'))
        properties.update(cls.create_semantic_property_dictionary('standoff_markups', list,\
                name='textHasMarkup', label='standoff tag of text', comment='Connects a text with a list of standoff tags.'))
        dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties }
        return cls.return_dictionary_after_updating_super_classes(dictionary)

# Index: svgscripts/datatypes/standoff_tag.py
# ===================================================================
# --- svgscripts/datatypes/standoff_tag.py (revision 82)
# +++ svgscripts/datatypes/standoff_tag.py (revision 83)
# @@ -1,139 +1,138 @@
# #!/usr/bin/env python3
# # -*- coding: utf-8 -*-
# """ This class can be used to represent the standoff markup of a text.
# """
#    Copyright (C) University of Basel 2020  {{{1
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.
#    If not, see <http://www.gnu.org/licenses/> 1}}}

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

from lxml import etree as ET
import re
import sys

from .attachable_object import AttachableObject

sys.path.append('py2ttl')
from class_spec import SemanticClass


class StandoffTag(AttachableObject,SemanticClass):
    """
    This class represents the standoff markup of a text.
    """
    MARKUP_STYLES = [ 'bold', 'italic' ]
    RELEVANT_STYLE_KEY = 'font-family'
    RELEVANT_CONTENT_STARTSWITH = 'Frutiger-'
    RELEVANT_PATTERN = re.compile('.*(Italic|Bold)$')
    RELEVANT_SUB_PATTERN = re.compile('Frutiger-(Light)*')

    def __init__(self, markup: str, startIndex: int, endIndex: int, id=0):
        self.id = str(id)
        self.markup = markup
        self.startIndex = startIndex
        self.endIndex = endIndex

    def attach_object_to_tree(self, target_tree):
        """Attach object to tree.

        An already existing node with the same markup tag and id is updated in place.
        """
        if target_tree.__class__.__name__ == '_ElementTree':
            target_tree = target_tree.getroot()
        existing_nodes = target_tree.xpath('.//' + self.markup + '[@id="%s"]' % self.id)
        obj_node = existing_nodes[0]\
                if len(existing_nodes) > 0\
                else ET.SubElement(target_tree, self.markup)
        obj_node.set('id', self.id)
        obj_node.set('start', str(self.startIndex))
        obj_node.set('end', str(self.endIndex))

    @classmethod
    def create_cls(cls, start_index, end_index, style_string, page=None, style_dict=None):
        """Creates a StandoffTag from a style_string.

        style_string: blank separated class names
        page: if set, the style dictionary is created from page
        style_dict: dictionary that maps class names to their style dictionaries

        :return: a list of (datatypes.standoff_tag) StandoffTag
        """
        if page is not None:
            style_dict = cls.create_relevant_style_dictionary(page)
        standoff_tags = []
        # This guard has to precede every access to style_dict, which may be None.
        if style_dict is None or len(style_dict) == 0:
            return standoff_tags
        # dict.fromkeys removes duplicates but, unlike a set, keeps the order of
        # style_string, so the order of the resulting tags is reproducible.
        relevant_keys = [ key for key in dict.fromkeys(style_string.split(' '))\
                if key in style_dict ]
        for relevant_key in relevant_keys:
            font_family = style_dict[relevant_key].get(cls.RELEVANT_STYLE_KEY, '')
            if re.match(cls.RELEVANT_PATTERN, font_family):
                # e.g. 'Frutiger-LightItalic' -> 'italic'
                markup = re.sub(cls.RELEVANT_SUB_PATTERN, '', font_family).lower()
                standoff_tags.append(cls(markup, start_index, end_index))
        return standoff_tags

    @classmethod
    def create_cls_from_node(cls, node):
        """Creates a StandoffTag from a node.

        :return: (datatypes.standoff_tag) StandoffTag
        """
        return cls(node.tag, int(node.get('start')), int(node.get('end')), id=node.get('id'))

    @classmethod
    def create_relevant_style_dictionary(cls, page):
        """Return a style dictionary that contains only relevant keys and contents.
        """
        return { key: key_dict for key, key_dict in page.style_dict.items()\
                if cls.RELEVANT_STYLE_KEY in key_dict.keys()\
                and key_dict[cls.RELEVANT_STYLE_KEY].startswith(cls.RELEVANT_CONTENT_STARTSWITH) }

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.
        """
        properties = {}
        properties.update(cls.create_semantic_property_dictionary('markup', str, cardinality=1,\
                name='standoffTagHasMarkup', label='standoff tag has a specific markup', comment='Connects a standoff tag with its markup, e.g. bold or italic'))
        properties.update(cls.create_semantic_property_dictionary('startIndex', int, cardinality=1,\
                name='standoffTagHasStartIndex', label='standoff tag has a start index', comment='Connects a standoff tag with its start index.'))
        properties.update(cls.create_semantic_property_dictionary('endIndex', int, cardinality=1,\
                name='standoffTagHasEndIndex', label='standoff tag has a end index', comment='Connects a standoff tag with its end index.'))
        dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties }
        return cls.return_dictionary_after_updating_super_classes(dictionary)

    def is_joinable(self, other):
        """Return true if self and other have same markup and self.endIndex == other.startIndex.
        """
        return self.markup == other.markup and self.endIndex == other.startIndex

    def join(self, other):
        """Join self with other.
        """
        self.endIndex = other.endIndex

    def join_list(self, others):
        """Join all others that are joinable, return remaining others as a list.
        """
        unjoinable_others = []
        for other in others:
            # joining moves self.endIndex, so later items are tested against the new end
            if self.is_joinable(other):
                self.join(other)
            else:
                unjoinable_others.append(other)
        return unjoinable_others

# Index: svgscripts/datatypes/matrix.py
# ===================================================================
# --- svgscripts/datatypes/matrix.py (revision 82)
# +++ svgscripts/datatypes/matrix.py (revision 83)
# @@ -1,296 +1,305 @@
# #!/usr/bin/env python3
# # -*- coding: utf-8 -*-
# """ This class can be used to transform a svg/text[@transform] matrix-string into a matrix representation.
# """
#    Copyright (C) University of Basel 2019  {{{1
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/> 1}}}

__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

import re
import math


class Matrix:
    """
    This class transforms a svg @transform matrix-string into a matrix representation.

    Args:
        transform_matrix_string (str) string of the form 'matrix(1.0 0.0 0.0 1.0 0.0 0.0)' or 'rotate(10)'
        transkription_field: if set, its xmin/ymin are subtracted from the x/y values of the matrix
        matrix_list (list) initial values, used (as a copy) if it has at least 6 items
    """
    A = 0
    B = 1
    C = 2
    D = 3
    E = 4
    F = 5
    XINDEX = 4
    YINDEX = 5
    MATRIX_LENGTH = 6
    DOWN = 1
    STRAIGHT = 0
    UP = -1
    # integer angle of a 'rotate(a)' string
    _ROTATE_PATTERN = re.compile(r'(?<=rotate\()[-]*[0-9]+')
    # validates a blank or comma separated 'matrix(a b c d e f)' string
    _MATRIX_PATTERN = re.compile(r'matrix\(\s*([-]*\d+(\.\d+(e-\d+)*)*[,\s][\s]*){5}[-]*\d+(\.\d+)*.*\s*\)')
    # captures the values between the parentheses
    _MATRIX_CONTENT_PATTERN = re.compile(r'matrix\(([^)]*)\)')

    def __init__(self, transform_matrix_string=None, transkription_field=None, matrix_list=None):
        if matrix_list is None or len(matrix_list) < Matrix.MATRIX_LENGTH:
            self.matrix = [ 0.0 for _ in range(Matrix.MATRIX_LENGTH) ]
        else:
            # Work on a copy: the offsets applied below must not change the list of the caller.
            self.matrix = list(matrix_list)
        if transform_matrix_string is not None:
            m = self._ROTATE_PATTERN.search(transform_matrix_string)
            if m is not None:
                # transform='rotate(a)' to transform='matrix(cos(a), sin(a), -sin(a), cos(a), 0, 0)'
                angle = math.radians(float(m.group(0)))
                self.matrix[Matrix.A] = round(math.cos(angle), 3)
                self.matrix[Matrix.B] = round(math.sin(angle), 3)
                self.matrix[Matrix.C] = round(math.sin(angle)*-1, 3)
                self.matrix[Matrix.D] = round(math.cos(angle), 3)
                self.matrix[Matrix.E] = 0
                self.matrix[Matrix.F] = 0
            elif self._MATRIX_PATTERN.search(transform_matrix_string):
                # Split on any run of commas/whitespace: splitting on a single blank
                # failed for strings with repeated or leading blanks.
                content = self._MATRIX_CONTENT_PATTERN.search(transform_matrix_string).group(1)
                self.matrix = [ float(i) for i in re.split(r'[,\s]+', content.strip()) if i != '' ]
            else:
                raise Exception('Error: string "{}" is not a valid transform matrix string!'.format(transform_matrix_string))
        if len(self.matrix) < Matrix.MATRIX_LENGTH:
            raise Exception('Error: string "{}" is not a valid matrix string!'.format(transform_matrix_string))
        if transkription_field is not None:
            self.matrix[Matrix.XINDEX] -= transkription_field.xmin
            self.matrix[Matrix.YINDEX] -= transkription_field.ymin

    def _transform_point(self, x, y):
        """Return the position (x, y) transformed by this matrix as a tuple (new_x, new_y).
        """
        return self.matrix[Matrix.A] * x + self.matrix[Matrix.C] * y + self.matrix[Matrix.E],\
                self.matrix[Matrix.B] * x + self.matrix[Matrix.D] * y + self.matrix[Matrix.F]

    def _to_local_position(self, x, y):
        """Return (x, y) relative to the translation of this matrix, 0.0 values are kept.
        """
        local_x = x - self.matrix[Matrix.E] if x != 0.0 else 0.0
        local_y = y - self.matrix[Matrix.F] if y != 0.0 else 0.0
        return local_x, local_y

    def add2X(self, add_to_x=0):
        """Return x-value of matrix (float) + add_to_x.
        """
        return self.matrix[Matrix.XINDEX] + float(add_to_x)

    def add2Y(self, add_to_y=0):
        """Return y-value of matrix (float) + add_to_y.
        """
        return self.matrix[Matrix.YINDEX] + float(add_to_y)

    def getX(self):
        """Return x-value of matrix (float).
        """
        return self.matrix[Matrix.XINDEX]

    def getY(self):
        """Return y-value of matrix (float).
        """
        return self.matrix[Matrix.YINDEX]

    def is_matrix_horizontal(self):
        """Returns whether matrix is horizontal.

        [:return:] True/False
        """
        return self.matrix[Matrix.A] == 1 and self.matrix[Matrix.B] == 0\
                and self.matrix[Matrix.C] == 0 and self.matrix[Matrix.D] == 1

    def get_new_x(self, x=0.0, y=0.0):
        """Returns new position of x.

        :return: (float) x
        """
        return self._transform_point(*self._to_local_position(x, y))[0]

    def get_new_y(self, x=0.0, y=0.0):
        """Returns new position of y.

        :return: (float) y
        """
        return self._transform_point(*self._to_local_position(x, y))[1]

    def get_old_x(self, x=0.0, y=0.0):
        """Returns old position of x.

        :return: (float) x
        """
        old_x = (self.matrix[Matrix.D]*x - self.matrix[Matrix.D]*self.matrix[Matrix.E]\
                - self.matrix[Matrix.C]*y + self.matrix[Matrix.C]*self.matrix[Matrix.F])\
                /(self.matrix[Matrix.A]*self.matrix[Matrix.D] - self.matrix[Matrix.B]*self.matrix[Matrix.C])
        return self.add2X(old_x)

    def get_transformed_positions(self, x=0.0, y=0.0, width=0.0, height=0.0):
        """Returns transformed x, y, width and height.

        The position is the transformed top left corner, width and height are the
        larger of the distances between the corresponding transformed corners.
        """
        new_x, new_y = self._transform_point(x, y)
        new_top_right_x, new_top_right_y = self._transform_point(x + width, y)
        new_bottom_left_x, new_bottom_left_y = self._transform_point(x, y + height)
        new_bottom_right_x, new_bottom_right_y = self._transform_point(x + width, y + height)
        new_width = max(abs(new_top_right_x - new_x), abs(new_bottom_right_x - new_bottom_left_x))
        new_height = max(abs(new_bottom_left_y - new_y), abs(new_top_right_y - new_bottom_right_y))
        return new_x, new_y, new_width, new_height

    def clone_transformation_matrix(self):
        """Returns a matrix that contains only the transformation part.

        [:return:] (Matrix) a clone of this matrix
        """
        return Matrix(matrix_list=self.matrix[0:4]+[0,0])

    def isRotationMatrix(self):
        """Return whether matrix is a rotation matrix.
        """
        return self.matrix[Matrix.A] < 1 or self.matrix[Matrix.B] != 0

    def toCSSTransformString(self):
        """Returns the CSS3 transform string: 'rotate(Xdeg)' where X is the angle.
        """
        angle = 0
        if self.isRotationMatrix():
            angle = int(round(math.degrees(math.asin(self.matrix[Matrix.B])), 0))
            if angle == 0:
                angle = int(round(math.degrees(math.acos(self.matrix[Matrix.A])), 0))
        return 'rotate({}deg)'.format(angle)

    def toString(self):
        """Returns a transform_matrix_string representation of the matrix.

        [:returns:] (str) 'matrix(X X X X X X)'
        """
        return 'matrix(' + ' '.join([ str(round(x, 5)) for x in self.matrix ]) + ')'

    def get_rotation_direction(self):
        """Get rotation direction of rotation matrix.

        [:return:] (int) direction code Matrix.UP, Matrix.STRAIGHT, Matrix.DOWN
        """
        if not self.isRotationMatrix():
            return self.STRAIGHT
        angle = int(round(math.degrees(math.asin(self.matrix[Matrix.B])), 0))
        return self.UP if angle < 0 else self.DOWN

    @staticmethod
    def IS_IN_FOOTNOTE_AREA(transform_matrix_string, transkription_field):
        """Returns true if matrix specifies a position that is part of the footnote area.

        transform_matrix_string (str): string from which to init Matrix.
        transkription_field (datatypes.transkription_field.TranskriptionField)
        """
        matrix = Matrix(transform_matrix_string=transform_matrix_string)
        if matrix.getY() < transkription_field.ymax:
            return False
        if transkription_field.is_page_verso():
            return matrix.getX() > transkription_field.xmin
        return matrix.getX() > transkription_field.documentWidth/4

    @staticmethod
    def IS_IN_MARGIN_FIELD(transform_matrix_string, transkription_field):
        """Returns true if matrix specifies a position that is part of the margin field.

        transform_matrix_string (str): string from which to init Matrix.
        transkription_field (datatypes.transkription_field.TranskriptionField)
        """
        line_number_area_width = 15\
                if transkription_field.line_number_area_width == 0.0\
                else transkription_field.line_number_area_width
        matrix = Matrix(transform_matrix_string=transform_matrix_string)
        if matrix.getY() < transkription_field.ymin or matrix.getY() > transkription_field.ymax:
            return False
        if transkription_field.is_page_verso():
            return matrix.getX() < transkription_field.xmin - line_number_area_width
        return matrix.getX() > transkription_field.xmax + line_number_area_width

    @staticmethod
    def IS_IN_PLACE_OF_PRINTING_AREA(transform_matrix_string, transkription_field):
        """Returns true if matrix specifies a position that is part of the area where the places of printing ('Druckorte') are printed.

        transform_matrix_string (str): string from which to init Matrix.
        transkription_field (datatypes.transkription_field.TranskriptionField)
        """
        matrix = Matrix(transform_matrix_string=transform_matrix_string)
        if matrix.getY() < transkription_field.ymax:
            return False
        if transkription_field.is_page_verso():
            return matrix.getX() < transkription_field.xmin
        return matrix.getX() < transkription_field.documentWidth/4

    @staticmethod
    def IS_PART_OF_TRANSKRIPTION_FIELD(transkription_field, text_node=None, matrix=None):
        """Returns true if matrix specifies a position that is part of transkription field.

        text_node (lxml.etree.Element)
        transkription_field (datatypes.transkription_field.TranskriptionField)
        matrix (Matrix): used instead of the transform of text_node if set
        """
        if matrix is None:
            if text_node is None or not bool(text_node.get('transform')):
                return False
            matrix = Matrix(transform_matrix_string=text_node.get('transform'))
        is_part = matrix.getX() > transkription_field.xmin and matrix.getX() < transkription_field.xmax\
                and matrix.getY() > transkription_field.ymin and matrix.getY() < transkription_field.ymax
        # Only a rotated text outside the field is checked again, by means of the
        # position of its first non-blank child; this needs a text_node.
        if is_part or not matrix.isRotationMatrix() or text_node is None:
            return is_part
        non_blank_children = [ child for child in text_node\
                if child.text is not None and not re.match(r'^\s*$', child.text) ]
        if len(non_blank_children) == 0:
            return is_part
        first_tspan_node = non_blank_children[0]
        x = matrix.add2X(float(first_tspan_node.get('x')))
        y = matrix.add2Y(float(first_tspan_node.get('y')))
        new_x = matrix.get_new_x(x=x, y=y)
        new_y = matrix.get_new_y(x=x, y=y)
        return new_x > transkription_field.xmin and new_x < transkription_field.xmax\
                and new_y > transkription_field.ymin and new_y < transkription_field.ymax

    @staticmethod
    def IS_NEARX_TRANSKRIPTION_FIELD(transform_matrix_string, transkription_field, diffx=20.0):
        """Returns true if matrix specifies a position that is on its x axis near the transkription_field.

        transform_matrix_string (str): string from which to init Matrix.
        transkription_field (svgscripts.TranskriptionField)
        diffx (float): defines threshold for positions that count as near.
        """
        matrix = Matrix(transform_matrix_string=transform_matrix_string)
        MINLEFT = transkription_field.xmin - diffx
        MAXRIGHT = transkription_field.xmax + diffx
        return matrix.getY() > transkription_field.ymin and matrix.getY() < transkription_field.ymax\
                and ((matrix.getX() > MINLEFT and matrix.getX() < transkription_field.xmin)\
                or (matrix.getX() > transkription_field.xmax and matrix.getX() < MAXRIGHT))

    @staticmethod
    def DO_CONVERSION_FACTORS_DIFFER(matrix_a, matrix_b, diff_threshold=0.001):
        """Returns whether the conversion factors (a-d) differ more than diff_threshold.
        """
        if matrix_a is None or matrix_b is None:
            return not (matrix_a is None and matrix_b is None)
        return any(abs(matrix_a.matrix[index] - matrix_b.matrix[index]) > diff_threshold\
                for index in (Matrix.A, Matrix.B, Matrix.C, Matrix.D))

    def __eq__(self, other):
        """Return self.matrix == other.matrix.
        """
        if not isinstance(other, Matrix):
            # let Python fall back to its default comparison instead of raising
            return NotImplemented
        return self.matrix == other.matrix

    def __hash__(self):
        """Return hash value.
        """
        return hash((self.matrix[Matrix.E], self.matrix[Matrix.F]))

# Index: svgscripts/datatypes/transkriptionField.py
# ===================================================================
# --- svgscripts/datatypes/transkriptionField.py (revision 82)
# +++ svgscripts/datatypes/transkriptionField.py (revision 83)
# @@ -1,168 +1,177 @@
# #!/usr/bin/env python3
# # -*- coding: utf-8 -*-
# """ This program can be used to transform a svg file according to the dimension of its transkription field.
# """
#    Copyright (C) University of Basel 2019  {{{1
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
class TranskriptionField:
    """
    A class containing the dimensions of the transkription field.

        Args:
            filename (str): name of the svg file

        Raises:
            ExpatError: if the svg file cannot be parsed.
            Exception: if the svg file has no attribute "viewBox".
    """

    def __init__(self, filename):
        self.width = 0.0
        self.height = 0.0
        self.xmin = 0.0
        self.xmax = 0.0
        self.ymin = 0.0
        self.ymax = 0.0
        self.documentWidth = 0.0
        self.documentHeight = 0.0
        self.path = None
        self.filename = filename
        self.line_number_area_width = 0.0
        # A path only counts as transkription field if it is larger than these minima ...
        MAX_SMALLER_PATH_WIDTH = 50.0
        MAX_SMALLER_PATH_HEIGHT = 50.0
        # ... and smaller than the document by more than these differences.
        MAX_DIFF_DOC_SELF_WIDTH = 100.0
        MAX_DIFF_DOC_SELF_HEIGHT = 100.0
        try:
            paths, attributes, self.svg_attributes = svg_to_paths.svg2paths(filename, return_svg_attributes=True)
        except ExpatError as error:
            # keep the parser error as cause, it holds the position of the failure
            raise ExpatError('File {} is empty!'.format(filename)) from error
        if len(self.svg_attributes) > 0 and bool(self.svg_attributes.get('viewBox')):
            viewBox = self.svg_attributes['viewBox'].split()
        else:
            raise Exception('File "{}" does not have an attribute "viewBox"'.format(filename))
        self.documentWidth = float(viewBox[2])
        self.documentHeight = float(viewBox[3])
        for index, path in enumerate(paths):
            if not bool(path):
                continue
            try:
                if path.iscontinuous() and path.isclosed():
                    xmin, xmax, ymin, ymax = path.bbox()
                    width = xmax - xmin
                    height = ymax - ymin
                    if 'transform' in attributes[index]:
                        # NOTE(review): get_new_x/get_new_y are called without arguments here and
                        # the result of get_transformed_positions is assigned to xmin/ymax -- confirm
                        # against the Matrix API before relying on xmax/ymin of transformed paths.
                        matrix = Matrix(attributes[index]['transform'])
                        xmin, ymax, width, height = matrix.get_transformed_positions(xmin, ymin, width, height)
                        xmax = matrix.get_new_x()
                        ymin = matrix.get_new_y()
                        #TODO fix this!!!
                    # the largest closed path that is clearly smaller than the document wins
                    if (width > self.width and height > self.height)\
                    and width > MAX_SMALLER_PATH_WIDTH\
                    and height > MAX_SMALLER_PATH_HEIGHT\
                    and self.documentWidth - width > MAX_DIFF_DOC_SELF_WIDTH\
                    and self.documentHeight - height > MAX_DIFF_DOC_SELF_HEIGHT:
                        self.xmin = xmin
                        self.xmax = xmax
                        self.ymin = ymin
                        self.ymax = ymax
                        self.width = width
                        self.height = height
                        self.path = path
            except AssertionError:
                print("Assertion Error")
        if self.is_shrunk():
            # a shrunk svg starts at the origin of its viewBox
            self.xmin = float(viewBox[0])
            self.ymin = float(viewBox[1])

    def add_line_number_area_width(self, end_positionX_of_line_number_area):
        """Adds the width of the line number area.

            On a verso page the width is measured from end_positionX_of_line_number_area to xmin,
            otherwise from xmax to end_positionX_of_line_number_area.
        """
        if self.is_page_verso():
            self.line_number_area_width = self.xmin - end_positionX_of_line_number_area
        else:
            self.line_number_area_width = end_positionX_of_line_number_area - self.xmax

    def is_page_verso(self):
        """Returns true if the area right of the TranskriptionField is less than the left area.
        """
        return self.documentWidth-self.xmax < self.xmin

    def is_shrunk(self):
        """Returns True if viewBox[0] and viewBox[1] != 0.

            Returns False if there is no attribute "viewBox".
        """
        if len(self.svg_attributes) == 0 or not bool(self.svg_attributes.get('viewBox')):
            return False
        viewBox = self.svg_attributes['viewBox'].split()
        # NOTE(review): both values have to differ from 0 -- confirm that a viewBox
        # shifted on one axis only should not count as shrunk.
        return float(viewBox[0]) != 0 and float(viewBox[1]) != 0

    def get_svg_attributes(self, attrib_key):
        """Returns the svg attribute for the corresponding key or None if empty.
        """
        if not self.svg_attributes:
            return None
        # empty values are reported as None, too
        return self.svg_attributes.get(attrib_key) or None

    def shrink_svg_to_transkription_field(self, target_filename=None):
        """ Changes the viewBox of the svg graphics to the size of the transkription field.
        If a target_filename is specified, the changes are saved to a new file,
        otherwise they are saved to the input file.

            Args:
                target_filename (str): name of the target svg file

            Returns:
                0 if the svg was changed and written,
                1 if the svg is already shrunk (nothing is written),
                2 if the svg root has no attribute "viewBox" (nothing is written).
        """
        if bool(self.svg_attributes.get('xmlns')):
            ET.register_namespace('', self.svg_attributes['xmlns'])
        if bool(self.svg_attributes.get('xmlns:xlink')):
            ET.register_namespace('xlink', self.svg_attributes['xmlns:xlink'])
        et = ET.parse(self.filename)
        root = et.getroot()
        if not bool(root.attrib.get('viewBox')):
            print('ERROR: file {} does not contain a svg/@viewBox!'.format(self.filename))
            #TODO: throw error
            return 2
        if self.is_shrunk():
            #print('File {} already transformed!'.format(self.filename))
            return 1
        root.attrib['viewBox'] = '{} {} {} {}'.format(self.xmin, self.ymin, self.width, self.height)
        # width and height are only overwritten if the svg already specifies them
        if bool(root.attrib.get('width')):
            root.attrib['width'] = '{}pt'.format(self.width)
        if bool(root.attrib.get('height')):
            root.attrib['height'] = '{}pt'.format(self.height)
        if not bool(target_filename):
            target_filename = self.filename
        et.write(target_filename)
        return 0

    def transkription_field_found(self):
        """ Returns whether transkription field was found in __init__

            :return: True/False
        """
        return self.width > 0.0 and self.height > 0.0\
                and self.xmin > 0.0 and self.xmax > 0.0\
                and self.ymin > 0.0 and self.ymax > 0.0

    def getWidth(self):
        """Returns documentWidth
        """
        return self.documentWidth

    def getHeight(self):
        """Returns documentHeight
        """
        return self.documentHeight
class TestExtractFootnotes(unittest.TestCase):
    """Tests the extraction of footnotes and the detection of footnote columns.
    """

    def setUp(self):
        """Set the paths of the svg and xml test data.
        """
        # NOTE(review): assigning UNITTESTING inside this method would only bind a local name;
        # to change the flag it has to be set on the module datatypes.footnotes itself.
        DATADIR = path.dirname(__file__) + sep + 'test_data'
        self.test_footnote = DATADIR + sep + 'W_I_8_neu_125-01.svg'
        self.test_footnote_verso = DATADIR + sep + 'N_VII_1_xp5_4_page5.svg'
        self.test_footnote_recto = DATADIR + sep + 'N_VII_1_xp5_4_page6.svg'
        self.test_footnote_multi = DATADIR + sep + 'N_VII_1_xp5_4_page13.svg'
        self.test_footnote_multi_xml = DATADIR + sep + 'N_VII_1_page013.xml'

    def test_extract_footnotes(self):
        """Footnotes can be extracted as strings and as objects, with and without a filter string.
        """
        footnotes = extract_footnotes_as_strings(svg_file=self.test_footnote_multi, contains_string='Anschlußzeichen')
        self.assertEqual(len(footnotes), 4)
        page = Page(self.test_footnote_multi_xml)
        footnotes = extract_footnotes(page, svg_file=self.test_footnote_multi, contains_string='Anschlußzeichen')
        self.assertEqual(len(footnotes), 4)
        # without contains_string all footnotes of the page are returned
        footnotes = extract_footnotes(page, svg_file=self.test_footnote_multi)
        self.assertEqual(len(footnotes), 7)

    def test_columns(self):
        """FootnoteColumns finds two columns and appends to the registered one.
        """
        svg_tree = ET.parse(self.test_footnote_multi)
        root = svg_tree.getroot()
        transkription_field = TranskriptionField(self.test_footnote_multi)
        nodes_in_footnote_area = [ node for node in root.iterfind('.//text', root.nsmap)\
                if Matrix.IS_IN_FOOTNOTE_AREA(node.get('transform'), transkription_field) ]
        # distinct y values of the footnote nodes in ascending order
        bottom_values = sorted({ Matrix(transform_matrix_string=node.get('transform')).getY()\
                for node in nodes_in_footnote_area })
        footnote_columns = FootnoteColumns(root.nsmap, nodes_in_footnote_area, bottom_values, None)
        self.assertEqual(len(footnote_columns.footnote_columns), 2)
        footnote_columns.register_index(184)
        footnote_columns.append('asdf')
        self.assertEqual(len(footnote_columns.footnote_columns[0]), 1)
        #print(footnote_columns.footnote_columns[0])