Index: svgscripts/datatypes/page.py =================================================================== --- svgscripts/datatypes/page.py (revision 12) +++ svgscripts/datatypes/page.py (revision 13) @@ -1,238 +1,236 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <https://www.gnu.org/licenses/> 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile #from myxmlwriter import write_pretty from .class_spec import SemanticClass from .image import Image from .word import Word from .lineNumber import LineNumber from .word_insertion_mark import WordInsertionMark from .transkriptionField import TranskriptionField class Page(SemanticClass): """ This class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. xml_target_file (str): name of the xml file to which page info will be written. """ - class_dictionary = {} + RDF_SUBCLASSES = ['http://www.knora.org/ontology/0000/information-carrier#Page'] + def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, pdfFile=None, svg_file=None, extract_transkription_field_only=False): self.title = title self.line_numbers = [] self.style_dict = {} self.sonderzeichen_list = [] self.svg_file = None self.pdfFile = None self.source = None self.number = int(page_number) if page_number is not None else -1 if xml_source_file is not None: if isfile(xml_source_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_source_file, parser) self.title = self.page_tree.getroot().get('title') self.number = self.page_tree.getroot().get('number') self.source = self.page_tree.getroot().get('source') self.init_words() self.add_style(style_node=self.page_tree.getroot().find('.//style')) self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\ if len(self.page_tree.xpath('.//svg/@file')) > 0 else None self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\ if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None if pdfFile is not None and self.pdfFile is None: self.pdfFile = pdfFile ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) #write_pretty(xml_element_tree=self.page_tree, file_name=xml_source_file, script_name=__file__, file_type='svgWordPosition') if svg_file is not None and self.svg_file is None: self.svg_file = svg_file tf = TranskriptionField(svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) #write_pretty(xml_element_tree=self.page_tree, file_name=xml_source_file, script_name=__file__, file_type='svgWordPosition') else: raise Exception('File "{}" does not exist!'.format(xml_source_file)) elif xml_target_file is not None: self.word_insertion_marks = [] self.words = [] self.svg_file = svg_file self.pdfFile = pdfFile if isfile(xml_target_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_target_file, parser) self.source = self.page_tree.getroot().get('source') if bool(self.page_tree.getroot().get('title')): self.title = self.page_tree.getroot().get('title') elif title is not None: self.page_tree.getroot().set('title', title) if self.svg_file is None: self.svg_file = self.page_tree.xpath('.//svg/@file')[0]\ if len(self.page_tree.xpath('.//svg/@file')) > 0 else None self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 elif len(self.page_tree.xpath('.//svg/@file')) == 0: tf = TranskriptionField(svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) else: self.width = float(self.page_tree.xpath('.//svg/@width')[0])\ if len(self.page_tree.xpath('.//svg/@width')) > 0 else 0.0 self.height = float(self.page_tree.xpath('.//svg/@height')[0])\ if len(self.page_tree.xpath('.//svg/@height')) > 0 else 0.0 if self.pdfFile is None: self.pdfFile = self.page_tree.xpath('.//pdf/@file')[0]\ if len(self.page_tree.xpath('.//pdf/@file')) > 0 else None elif len(self.page_tree.xpath('.//pdf/@file')) == 0: ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) for xpath2remove in [ 'word', 'style', 'freehand', LineNumber.XML_TAG() ]: for node in self.page_tree.xpath('//' + xpath2remove): node.getparent().remove(node) else: self.page_tree = ET.ElementTree(ET.Element('page')) self.pdfFile = pdfFile self.svg_file = svg_file if title is not None: self.page_tree.getroot().set('title', title) self.page_tree.getroot().set('transkription-field-only', str(extract_transkription_field_only).lower()) if page_number is not None: self.page_tree.getroot().set('number', str(page_number)) if self.pdfFile is not None: ET.SubElement(self.page_tree.getroot(), 'pdf', attrib={'file': self.pdfFile}) if self.svg_file is not None: tf = TranskriptionField(self.svg_file) self.width = round(tf.documentWidth, 3) self.height = round(tf.documentHeight, 3) ET.SubElement(self.page_tree.getroot(), 'svg', attrib={'width': str(self.width), 'height': str(self.height), 'file': self.svg_file}) self.svg_image = Image(file_name=self.svg_file, height=self.height, width=self.width) if self.svg_file is not None\ else None - self.create_semantic_dictionary(Page.class_dictionary) def init_line_numbers(self, line_numbers, document_bottom): """Init line numbers. """ even_index = 0 MINABOVE = 1 self.line_numbers = [] if len(line_numbers) > 0: first_line_bottom = line_numbers[even_index].top - MINABOVE self.line_numbers.append(LineNumber(id=1, top=0, bottom=first_line_bottom)) self.line_numbers.append(line_numbers[even_index]) even_index += 1 while even_index < len(line_numbers): self.line_numbers.append(LineNumber(id=line_numbers[even_index].id-1,\ top=line_numbers[even_index-1].bottom+MINABOVE,\ bottom=line_numbers[even_index].top-MINABOVE)) self.line_numbers.append(line_numbers[even_index]) even_index += 1 self.line_numbers.append(LineNumber(id=line_numbers[even_index-1].id+1,\ top=line_numbers[even_index-1].bottom+MINABOVE,\ bottom=document_bottom)) for line_number in self.line_numbers: line_number.attach_object_to_tree(self.page_tree) def init_words(self): self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG()) ] self.words = [ Word.CREATE_WORD(word_node=word_node) for word_node in self.page_tree.getroot().xpath('//word') ] self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG()) ] for index, word in enumerate(self.words): for word_insertion_mark in self.word_insertion_marks: self.words[index] = word_insertion_mark.attach_and_update_word_if_involved(word) if self.words[index] != word: break def add_style(self, sonderzeichen_list=[], letterspacing_list=[], style_dict={}, style_node=None): """Adds a list of classes that are sonderzeichen and a style dictionary to page. """ self.sonderzeichen_list = sonderzeichen_list self.letterspacing_list = letterspacing_list self.style_dict = style_dict if style_node is not None: self.style_dict = { item.get('name'): { key: value for key, value in item.attrib.items() if key != 'name' } for item in style_node.findall('.//class') } self.sonderzeichen_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('font-family')) and 'Sonderzeichen' in item.get('font-family') ] self.letterspacing_list = [ item.get('name') for item in style_node.findall('.//class')\ if bool(item.get('letterspacing-list')) ] elif bool(self.style_dict): style_node = ET.SubElement(self.page_tree.getroot(), 'style') if len(self.sonderzeichen_list) > 0: style_node.set('Sonderzeichen', ' '.join(self.sonderzeichen_list)) if len(self.letterspacing_list) > 0: style_node.set('letterspacing-list', ' '.join(self.letterspacing_list)) for key in self.style_dict.keys(): self.style_dict[key]['name'] = key ET.SubElement(style_node, 'class', attrib=self.style_dict[key]) def get_biggest_fontSize4styles(self, style_set={}): """Returns biggest font size from style_dict for a set of style class names. [:returns:] (float) biggest font size OR 1 if style_dict is empty """ if bool(self.style_dict): sorted_font_sizes = sorted( (float(self.style_dict[key]['font-size'].replace('px','')) for key in style_set if bool(self.style_dict[key].get('font-size'))), reverse=True) return sorted_font_sizes[0] if len(sorted_font_sizes) > 0 else 1 else: return 1 def get_line_number(self, y): """Returns line number id for element at y. [:return:] (int) line number id or -1 """ if len(self.line_numbers) > 0: result_list = [ line_number.id for line_number in self.line_numbers if y >= line_number.top and y <= line_number.bottom ] return result_list[0] if len(result_list) > 0 else -1 else: return -1 - def create_semantic_dictionary(self, dictionary): + @classmethod + def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ - if len(dictionary) == 0: - class_dict = self.get_class_dictionary() - if self.__class__ == Page: - class_dict.update({'rdf:subClassOf': 'http://www.knora.org/ontology/0000/information-carrier#Page'}) - properties = {'title': (str, 1), 'number': (str, 1), 'line_numbers': (LineNumber, SemanticClass.LIST), 'words': (Word, SemanticClass.LIST),\ - 'svg_image': (Image, 1), 'word_insertion_marks': (WordInsertionMark, SemanticClass.LIST)} - dictionary.update({'class': class_dict}) - dictionary.update({'properties': properties}) - - @staticmethod - def get_semantic_dictionary(): - return __class__.class_dictionary + dictionary = {} + class_dict = cls.get_class_dictionary() + properties = {'title': (str, 1), 'number': (str, 1), 'line_numbers': (LineNumber, SemanticClass.LIST), 'words': (Word, SemanticClass.LIST),\ + 'svg_image': (Image, 1), 'word_insertion_marks': (WordInsertionMark, SemanticClass.LIST)} + dictionary.update({'class': class_dict}) + dictionary.update({'properties': properties}) + return dictionary + +