Index: svgscripts/extractWordPosition.py =================================================================== --- svgscripts/extractWordPosition.py (revision 110) +++ svgscripts/extractWordPosition.py (revision 111) @@ -1,562 +1,710 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to extract the position of the words in an svg file and write them to an xml file. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} import inspect import getopt from lxml import etree as ET from os import sep, listdir, mkdir, path from os.path import exists, isfile, isdir from progress.bar import Bar import re import sys import warnings from datatypes.lineNumber import LineNumber from datatypes.matrix import Matrix from datatypes.page_creator import PageCreator, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from datatypes.pdf import PDFText +from datatypes.positional_word_part import PositionalWordPart from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.word import Word from datatypes.word_insertion_mark import WordInsertionMark from util import process_warnings4status, reset_tp_with_matrix sys.path.append('shared_util') from myxmlwriter import write_pretty __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" class Extractor: """ This class can be used to extract the word positions in an svg file and write them to an xml file. Args: [xml_dir (str): target directory] [title (str): title of document] [manuscript_file (str): xml file containing information about the archival unit to which the current page belongs] """ UNITTESTING = False SONDERZEICHEN_LIST = [ 'A', 'B', '{', '}' ] SET_POSITIONS_TO_TEXTFIELD_0_0 = False def __init__(self, xml_dir=None, title=None, manuscript_file=None, compare2pdf=False): if bool(xml_dir): self.xml_dir = xml_dir not isdir(self.xml_dir) and mkdir(self.xml_dir) else: self.xml_dir = 'xml' if(isdir('xml')) else '' self.latest_status = None self.compare2pdf = compare2pdf self.xml_dir = self.xml_dir + sep if(bool(self.xml_dir)) else '' self.title = title self.manuscript_file = manuscript_file self.manuscript_tree = None self.svg_tree = None if not bool(self.title) and bool(self.manuscript_file) and isfile(self.manuscript_file): self.manuscript_tree = ET.parse(self.manuscript_file) self.title = self.manuscript_tree.getroot().get('title') elif bool(self.manuscript_file): raise FileNotFoundError('File "{}" does not exist!'.format(self.manuscript_file)) elif bool(self.title): self.update_title_and_manuscript(self.title, False) + def _get_pwps_break_points(self, page, pwps) ->list: + """Return a list of break points from pwps.
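+
+        A minimal standalone sketch of the gap heuristic used below (the left
+        positions are made up; in real runs they come from PositionalWordPart.left):
+
+            lefts = [10.0, 14.5, 40.2, 44.0]
+            THRESHOLDX = 20
+            break_points, last_x = [], -1
+            for i, x in enumerate(lefts):
+                if last_x > -1 and x - last_x > THRESHOLDX:
+                    break_points.append((i, i))   # split between index 1 and 2
+                last_x = x
+            # break_points == [(2, 2)]: pwps[0:2] is a line number, pwps[2:] the word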
+ """ + break_points = [] + if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points + for Sonderzeichen in self.SONDERZEICHEN_LIST: + sonderzeichen_pwps = [ pwp for pwp in pwps if pwp.text == Sonderzeichen and any(sz in pwp.style_class for sz in page.sonderzeichen_list) ] + if len(sonderzeichen_pwps) > 0: + break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(sonderzeichen_pwps) ]] + for pwp in sonderzeichen_pwps: + wim_index = len(page.word_insertion_marks) + wim = WordInsertionMark(id=wim_index, x=pwp.left, y=pwp.top-pwp.height, height=pwp.height, width=pwp.width,\ + line_number=page.get_line_number(pwp.top-pwp.height-1), mark_type=Sonderzeichen) + page.word_insertion_marks.append(wim) + if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_pwps(pwps)))): # case: digits from line number and chars from words -> create break points + THRESHOLDX = 20 # Threshold between line number and text + last_x = -1 + for i, x in enumerate([float(pwp.left) for pwp in pwps]): + if(last_x > -1 and (x - last_x > THRESHOLDX)): + break_points.append((i, i)) + last_x = x + return break_points + def _get_break_points(self, page, word_part_objs, transkription_field=None) ->list: """Return a list of break points from word_part_objs. """ break_points = [] if(len(page.sonderzeichen_list) > 0): # check for Sonderzeichen and special chars -> mark for word insertion, create break points for Sonderzeichen in self.SONDERZEICHEN_LIST: contains_Sonderzeichen = [ dict['text'] == Sonderzeichen and any(sz in dict['class'] for sz in page.sonderzeichen_list) for dict in word_part_objs ] if True in contains_Sonderzeichen: break_points += [ (endPoint, endPoint + 1) for endPoint in [i for i, e in enumerate(contains_Sonderzeichen) if e == True ]] for sz_point in [i for i, e in break_points]: wim_index = len(page.word_insertion_marks) x = float(word_part_objs[sz_point]['x']) y = float(word_part_objs[sz_point]['y']) if page.svg_file is not None and isfile(page.svg_file)\ and (not self.SET_POSITIONS_TO_TEXTFIELD_0_0 or transkription_field is not None): svg_path_tree = ET.parse(page.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } xmin = 0 if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field.xmin ymin = 0 if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field.ymin wim = WordInsertionMark.CREATE_WORD_INSERTION_MARK(svg_path_tree, namespaces, id=wim_index, x=x, y=y, xmin=xmin, ymin=ymin,\ line_number=page.get_line_number(y-1), mark_type=Sonderzeichen) page.word_insertion_marks.append(wim) if(bool(re.search(r'\d[A-Za-z]', self.get_word_from_part_obj(word_part_objs)))): # case: digits from line number and chars from words -> create break points THRESHOLDX = 20 # Threshold between line number and text last_x = -1 for i, x in enumerate([float(dict['x']) for dict in word_part_objs]): if(last_x > -1 and (x - last_x > THRESHOLDX)): break_points.append((i, i)) last_x = x return break_points + def _process_pwps_break_points(self, break_points, page, index, pwps) ->int: + """Process break points on pwps and return new index. 
+ """ + from_index = 0 + debug_msg = 'process break points' + for end_point, next_from_index in break_points: + new_pwps = pwps[from_index:end_point] + from_index = next_from_index + index = self.create_word_from_pwps(page, index, new_pwps, debug_msg=debug_msg) + if from_index > 0 and from_index < len(pwps): + new_pwps = pwps[from_index:] + index = self.create_word_from_pwps(page, index, new_pwps, debug_msg=debug_msg + ' ... end point') + if len(page.words) > 1\ + and re.match(r'[^\w\s]', page.words[-1].text): + last_word = page.words.pop() + page.words[-1].join(last_word) + return last_word.id + return index + def _process_break_points(self, break_points, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None) ->int: """Process break points on word_part_objs and return new index. """ from_index = 0 for end_point, next_from_index in break_points: new_word_part_objs = word_part_objs[from_index:end_point] new_endX = word_part_objs[end_point]['x'] from_index = next_from_index index = self.add_word(page, index, new_word_part_objs, None, new_endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field) if from_index > 0 and from_index < len(word_part_objs): new_word_part_objs = word_part_objs[from_index:] index = self.add_word(page, index, new_word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field) return index def add_word(self, page, index, word_part_objs, endSign, endX, matrix=None, debug_msg=None, transkription_field=None) ->int: """Creates transkription_positions and a new word from word_part_objs (i.e. a list of dictionaries about parts of this word). If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, word_part_objs will be split and several words are created. 
:returns: the new word counter (int) """ break_points = self._get_break_points(page, word_part_objs, transkription_field=transkription_field) if(len(break_points) > 0): # if there are break points -> split word_part_obj and add the corresponding words return self._process_break_points(break_points, page, index, word_part_objs, endSign, endX, matrix=matrix, debug_msg=debug_msg, transkription_field=transkription_field) else: if len(word_part_objs) > 0: provide_tf = None if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=matrix,\ debug_msg_string=debug_msg, transkription_field=provide_tf, svg_path_tree=self.svg_tree) text = self.get_word_from_part_obj(word_part_objs) line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2) if line_number == -1: if transkription_positions[0].transform is not None: line_number = page.get_line_number(transkription_positions[0].transform.getY()) if line_number == -1 and len(page.words) > 0: lastWord = page.words[-1] lastWord_lastTP = lastWord.transkription_positions[-1] lastTP = transkription_positions[-1] if transkription_positions[0].left > lastWord_lastTP.left\ and abs(lastWord_lastTP.bottom-lastTP.bottom) < lastTP.height/2: line_number = lastWord.line_number else: line_number = lastWord.line_number+1 #reset_tp_with_matrix(transkription_positions) newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions) page.words.append(newWord) return int(index) + 1 else: return int(index) + def create_word_from_pwps(self, page, index, pwps, debug_msg=None) ->int: + """Creates transkription_positions and a new word from pwps (i.e. a list of PositionalWordPart). + If word contains a Sonderzeichen as specified by self.SONDERZEICHEN_LIST, pwps will be split and several words are created. 
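+
+        A hedged usage sketch (assumes pwps were built beforehand via
+        PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST for one word):
+
+            index = self.create_word_from_pwps(page, index, pwps, debug_msg='example')
+            # expected: one new Word appended to page.words and index incremented,
+            # or several words if pwps contained break points
+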
+ + :returns: the new word counter (int) + """ + break_points = self._get_pwps_break_points(page, pwps) + if(len(break_points) > 0): # if there are break points -> split pwps and add the corresponding words + return self._process_pwps_break_points(break_points, page, index, pwps) + else: + if len(pwps) > 0: + transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(pwps, debug_msg_string=debug_msg) + text = self.get_word_from_pwps(pwps) + line_number = page.get_line_number((transkription_positions[0].bottom+transkription_positions[0].top)/2) + if line_number == -1: + if transkription_positions[0].transform is not None: + line_number = page.get_line_number(transkription_positions[0].transform.getY()) + if line_number == -1 and len(page.words) > 0: + lastWord = page.words[-1] + lastWord_lastTP = lastWord.transkription_positions[-1] + lastTP = transkription_positions[-1] + if transkription_positions[0].left > lastWord_lastTP.left\ + and abs(lastWord_lastTP.bottom-lastTP.bottom) < lastTP.height/2: + line_number = lastWord.line_number + else: + line_number = lastWord.line_number+1 + #reset_tp_with_matrix(transkription_positions) + newWord = Word(id=index, text=text, line_number=line_number, transkription_positions=transkription_positions) + page.words.append(newWord) + return int(index) + 1 + else: + return int(index) + def extractAndWriteInformation(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, record_warnings=False, warning_filter='default', multipage_index=-1, marginals_page=None): """Extracts information about positions of text elements and writes them to a xml file. """ if isfile(file_name): if not bool(xml_target_file): xml_target_file = self.get_file_name(file_name, page_number) if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)): xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file exit_status = 0 with warnings.catch_warnings(record=record_warnings) as w: warnings.simplefilter(warning_filter) page = self.extract_information(file_name, page_number=page_number, xml_target_file=xml_target_file, svg_file=svg_file, pdfFile=pdfFile,\ multipage_index=multipage_index, marginals_page=marginals_page) status_message = process_warnings4status(w, [ PageCreator.WARNING_MISSING_USE_NODE4PWP, PageCreator.WARNING_MISSING_GLYPH_ID4WIM ],\ '', 'OK', 'with warnings') if status_message != 'OK': self.latest_status = status_message exit_status = 1 else: self.latest_status = None page.page_tree.getroot().set('status', status_message) write_pretty(xml_element_tree=page.page_tree, file_name=xml_target_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) return exit_status else: raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name)) def extract_information(self, file_name, page_number=None, xml_target_file=None, svg_file=None, pdfFile=None, multipage_index=-1, marginals_page=None) -> PageCreator: """Extracts information about positions of text elements. 
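        A minimal usage sketch (the file and page names are hypothetical):

            extractor = Extractor(xml_dir='xml')
            page = extractor.extract_information('svg/W_II_page013.svg', page_number='13')
            # page.words, page.line_numbers and page.word_insertion_marks are populated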
""" if isfile(file_name): if not bool(xml_target_file): xml_target_file = self.get_file_name(file_name, page_number) if bool(self.xml_dir) and not bool(path.dirname(xml_target_file)): xml_target_file = path.dirname(self.xml_dir) + sep + xml_target_file transkription_field = TranskriptionField(file_name, multipage_index=multipage_index) text_field = transkription_field.convert_to_text_field() self.svg_tree = ET.parse(file_name) page = PageCreator(xml_target_file, title=self.title, multipage_index=multipage_index,\ page_number=page_number, pdfFile=pdfFile, svg_file=svg_file,\ svg_text_field=text_field, source=file_name, marginals_source=marginals_page) sonderzeichen_list, letterspacing_list, style_dict = self.get_style(self.svg_tree.getroot()) page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict) page.init_line_numbers(LineNumber.extract_line_numbers(self.svg_tree, transkription_field, set_to_text_field_zero=self.SET_POSITIONS_TO_TEXTFIELD_0_0),\ transkription_field.ymax) - self.extract_word_position(self.svg_tree, page, transkription_field=transkription_field) + self.improved_extract_word_position(self.svg_tree, page, transkription_field=transkription_field) page.create_writing_processes_and_attach2tree() page.update_and_attach_words2tree() for word_insertion_mark in page.word_insertion_marks: # it is not clear if we really need to know this alternative word ordering. See 'TODO.md' #word_insertion_mark.inserted_words = self.find_inserted_words(page.page_tree, word_insertion_mark) word_insertion_mark.attach_object_to_tree(page.page_tree) return page else: raise FileNotFoundError('\"{}\" is not an existing file!'.format(file_name)) + def improved_extract_word_position(self, svg_tree, page, transkription_field=None): + """Extracts word positions. + """ + if page.svg_file is None or not isfile(page.svg_file): + warnings.warn('There is no page.svg_file or it does not exist ... 
using old function "extract_word_position"!') + self.extract_word_position(svg_tree, page, transkription_field=transkription_field) + else: + svg_path_tree = ET.parse(page.svg_file) + namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } + counter = 0 + word_part_obj = [] + pwps = [] + endSign = '%' + last_matrix = None + MAXBOTTOMDIFF = 5 + MAXXDIFF = 11 + INTERCHARSPACE = 1.1 + if not Extractor.UNITTESTING: + bar = Bar('(improved) extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)])) + for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field): + current_matrix = Matrix(text_item.get('transform')) + # check for line breaks + if last_matrix is not None and len(pwps) > 0 and (\ + (current_matrix.getX() > pwps[-1].left+pwps[-1].width + INTERCHARSPACE or last_matrix.getX()-current_matrix.getX() > MAXXDIFF) or\ + (abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF)): + endSign = '%' + if(self.get_word_from_pwps(pwps) != ''): + debug_msg = 'check for line breaks, diffx: {}, diffy: {}, current_matrix: {}, last_matrix: {}'.format(\ + round(current_matrix.getX() - (pwps[-1].left+pwps[-1].width), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\ + current_matrix.toString(), last_matrix.toString()) + counter = self.create_word_from_pwps(page, counter, pwps, debug_msg=debug_msg) + pwps = [] + endX = current_matrix.getX() + if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: TEXT + if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))): + x = current_matrix.getX() if not current_matrix.isRotationMatrix() else 0.0 + y = current_matrix.getY() if not current_matrix.isRotationMatrix() else 0.0 + pwps += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST({ "text": text_item.text, "x": x, "y": y, "class": text_item.get('class'), "matrix": current_matrix},\ + svg_path_tree, namespaces, page=page) + else: + if(self.get_word_from_pwps(pwps) != ''): + counter = self.create_word_from_pwps(page, counter, pwps, debug_msg="text: next string empty") + pwps = [] + for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: TEXT + endX = current_matrix.add2X(tspan_item.get('x')) + if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))): + y = current_matrix.add2Y(tspan_item.get('y')) + pwps += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST({ "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'),\ + "matrix": current_matrix }, svg_path_tree, namespaces, page=page) + if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0: + """text_item has letterspacing class + (set s & set t = new set with elements common to s and t) + """ + if(self.get_word_from_pwps(pwps) != ''): + counter = self.create_word_from_pwps(page, counter, pwps, debug_msg="letterspacing class") + pwps = [] + else: + if(self.get_word_from_pwps(pwps) != ''): + counter = self.create_word_from_pwps(page, counter, pwps, debug_msg="tspan: next string empty") + pwps = [] + last_matrix = current_matrix + not bool(Extractor.UNITTESTING) and bar.next() + if(self.get_word_from_pwps(pwps) != ''): + counter = self.create_word_from_pwps(page, counter, pwps, debug_msg='end of loop') + pwps = [] + not bool(Extractor.UNITTESTING) and bar.finish() + def extract_word_position(self, svg_tree, page, 
transkription_field=None): """Extracts word positions. """ counter = 0 word_part_obj = [] endSign = '%' last_matrix = None MAXBOTTOMDIFF = 5 MAXXDIFF = 6 if not Extractor.UNITTESTING: bar = Bar('extracting word positions from text_item', max=len([*self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)])) for text_item in self.get_text_items(svg_tree.getroot(), transkription_field=transkription_field): provide_tf = None if not self.SET_POSITIONS_TO_TEXTFIELD_0_0 else transkription_field current_matrix = Matrix(text_item.get('transform'), transkription_field=provide_tf) # check for line breaks if (last_matrix is not None and len(word_part_obj) > 0 and (\ Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix) or\ (abs(current_matrix.getY() - last_matrix.getY()) > MAXBOTTOMDIFF) or\ (abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']) > MAXXDIFF)))\ or (len(word_part_obj) > 0 and self.get_word_object_multi_char_x(word_part_obj[0]) > current_matrix.getX()): endSign = '%' if(self.get_word_from_part_obj(word_part_obj) != ''): debug_msg = 'check for line breaks, diffx: {}, diffy: {}, diff_conversion_matrix: {}'.format(\ round(abs(current_matrix.getX() - word_part_obj[len(word_part_obj)-1]['x']), 3), round(abs(current_matrix.getY() - last_matrix.getY()), 3),\ str(Matrix.DO_CONVERSION_FACTORS_DIFFER(last_matrix, current_matrix))) counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg=debug_msg, transkription_field=transkription_field) word_part_obj = [] endX = current_matrix.getX() if(len(text_item.findall(".//tspan", svg_tree.getroot().nsmap)) < 1): # case: TEXT if(bool(text_item.text) and not bool(re.search(r'^\s*$', text_item.text))): word_part_obj.append( { "text": text_item.text, "x": current_matrix.getX(), "y": current_matrix.getY(), "class": text_item.get('class'), "matrix": current_matrix} ) else: endSign = text_item.text if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=last_matrix, debug_msg='svg/text/\s', transkription_field=transkription_field) word_part_obj = [] endSign = '%' for tspan_item in text_item.findall(".//tspan", svg_tree.getroot().nsmap): # case: TEXT endX = current_matrix.add2X(tspan_item.get('x')) if(tspan_item.text != None and tspan_item.text != '' and not bool(re.search(r'^\s*$', tspan_item.text))): y = current_matrix.add2Y(tspan_item.get('y')) word_part_obj.append( { "text": tspan_item.text, "x": endX, "y": y, "class": tspan_item.get('class'), "matrix": current_matrix }) if len(set(page.letterspacing_list) & set(tspan_item.get('class').split(' '))) > 0: """text_item has letterspacing class (set s & set t = new set with elements common to s and t) """ endSign = '%' if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\ debug_msg='tspan with letterspacing', transkription_field=transkription_field) word_part_obj = [] else: endSign = tspan_item.text if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, matrix=current_matrix,\ debug_msg='svg/text/tspan/\s', transkription_field=transkription_field) word_part_obj = [] endSign = '%' last_matrix = current_matrix not bool(Extractor.UNITTESTING) and bar.next() if(self.get_word_from_part_obj(word_part_obj) != ''): counter = self.add_word(page, counter, word_part_obj, endSign, endX, 
matrix=current_matrix, debug_msg='end of loop',\ transkription_field=transkription_field) word_part_obj = [] endSign = '%' not bool(Extractor.UNITTESTING) and bar.finish() def find_inserted_words_by_position(self, target_tree, x, y): """Returns an Array with the words that are inserted above the x, y position or [] if not found. """ warnings.warn('Function "find_inserted_words_by_position" does not work and it is not clear whether we need this.') MINY = 31.0 MAXY = 10.0 DIFFX = 9.0 if(len(target_tree.getroot().xpath('//word[@id]')) > 0): result_list = [] minus2left = 20.0 minus2top = 19.0 while len(result_list) == 0 and minus2top < MINY and minus2left > DIFFX : result_list = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@top>{0} and @top<{1} and @left>{2} and @left<{3}]'.format(y - minus2top, y - MAXY, x - minus2left, x + DIFFX)) ] minus2left -= 1 minus2top += 1 if len(result_list) > 0: result_bottom = result_list[len(result_list)-1].bottom result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width for item in target_tree.getroot().xpath('//word[@bottom={0} and @left>{1}]'.format(result_bottom, result_left_min)): result_left_min = result_list[len(result_list)-1].left + result_list[len(result_list)-1].width result_left_max = result_left_min + DIFFX if float(item.get('left')) - result_left_max < DIFFX: result_list.append(Word.CREATE_WORD(item)) else: break return result_list else: return [] def find_inserted_words(self, target_tree, word_insertion_mark): """Returns an Array with the words that are inserted above/underneath the word_insertion_mark. """ warnings.warn('Function "find_inserted_words" does not work and it is not clear whether we need this.') if word_insertion_mark.line_number < 2 or word_insertion_mark.line_number % 2 == 1: return self.find_inserted_words_by_position(target_tree, word_insertion_mark.x, word_insertion_mark.y) if(len(target_tree.getroot().xpath('//word[@id]')) > 0): MINY = 31.0 MAXY = 10.0 DIFFX = 9.0 result_list = [] x = word_insertion_mark.x y = word_insertion_mark.y if word_insertion_mark.mark_type != 'B': # all insertions that are above the current line line_number = word_insertion_mark.line_number - 1 words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@line-number={0}]'.format(line_number)) ] if len(words_on_line) > 0: minus2top = 1.0 while len(result_list) == 0 and minus2top < MINY: for word in words_on_line: for transkription_position in word.transkription_positions: if transkription_position.top > y - minus2top\ and transkription_position.left > x - DIFFX\ and transkription_position.left < x + DIFFX: result_list.append(word) break minus2top += 1 elif word_insertion_mark.mark_type == 'B': # B means insertion is underneath the current line line_number = word_insertion_mark.line_number + 1 words_on_line = [ Word.CREATE_WORD(item) for item in target_tree.getroot().xpath(\ '//word[@line-number={0}]'.format(line_number)) ] if len(words_on_line) > 0: plus2top = 1.0 while len(result_list) == 0 and plus2top < MINY : for word in words_on_line: for transkription_position in word.transkription_positions: if transkription_position.top > y + plus2top\ and transkription_position.left > x - DIFFX\ and transkription_position.left < x + DIFFX: result_list.append(word) break plus2top += 1 if len(result_list) > 0: # now, collect more words that are right of already collected words result_bottom = result_list[len(result_list)-1].transkription_positions[0].bottom 
result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\ + result_list[len(result_list)-1].transkription_positions[0].width for item in target_tree.getroot().xpath(\ '//word[@line-number={0} and @bottom>{1} and @bottom<{2} and @left>{3}]'.format(line_number, result_bottom-5, result_bottom+5, result_left_min)): result_left_min = result_list[len(result_list)-1].transkription_positions[0].left\ + result_list[len(result_list)-1].transkription_positions[0].width result_left_max = result_left_min + DIFFX if float(item.get('left')) - result_left_max < DIFFX: result_list.append(Word.CREATE_WORD(item)) else: break return result_list else: return [] def get_file_name(self, file_name, page_number=None): """Returns the file_name of the target xml file. """ dir_name = path.dirname(self.xml_dir) + sep if(bool(self.xml_dir)) else '' if bool(self.title): return dir_name + self.title.replace(' ', '_') + '_page' + self.get_page_number(file_name, page_number=page_number) + '.xml' else: return '{}{}'.format(dir_name, path.basename(file_name).replace('.svg', '.xml')) def get_page_number(self, file_name, page_number=None): """ Returns page number as a string (with leading zero(s) if len(page_number) < 3). """ if not bool(page_number) and bool(re.search(r'\d', file_name)): """if page_number=None and filename contains digits, then split filename into its parts that contain only digits, remove empty strings and return the last part containing only digits. """ page_number = list(filter(lambda x: x != '', re.split(r'\D+', file_name))).pop() if bool(page_number): leading_zeros = '00' if(len(page_number) == 1) else '0' if(len(page_number) == 2) else '' return leading_zeros + str(page_number) else: return '' def get_style(self, etree_root): """Returns the style specification as a dictionary. :returns: sonderzeichen_list: list of keys for classes that are 'Sonderzeichen' style_dict: dictionary: key = class name (str), value = style specification (dictionary) """ style_dict = {} sonderzeichen_list = [] letterspacing_list = [] style = etree_root.find('style', etree_root.nsmap) if style is not None: for style_item in list(filter(lambda x: x != '', style.text.split("\n\t"))): style_key = style_item.split('{')[0].replace('.', '') style_value_dict = { item.split(':')[0]: item.split(':')[1].replace('\'','') \ for item in list(filter(lambda x: x!= '', style_item.split('{')[1].replace('}', '').replace('\n','').split(';')))} style_dict[style_key] = style_value_dict if bool(style_value_dict.get('font-family')) and 'Sonderzeichen' in style_value_dict.get('font-family'): sonderzeichen_list.append(style_key) if bool(style_value_dict.get('letter-spacing')): letterspacing_list.append(style_key) return sonderzeichen_list, letterspacing_list, style_dict def get_text_items(self, tree_root, transkription_field=None): """Returns all text elements with a matrix or (if transkription_field is specified) all text elements that are located inside the transkription field. """ if transkription_field is not None: return filter(lambda x: Matrix.IS_PART_OF_TRANSKRIPTION_FIELD(transkription_field, text_node=x),\ tree_root.iterfind(".//text", tree_root.nsmap)) else: return tree_root.iterfind(".//text", tree_root.nsmap) + def get_word_from_pwps(self, pwps): + """Extracts all 'text' from a list of PositionalWordPart objects and concatenates it into a string. + """ + return ''.join([ pwp.text for pwp in pwps ]) + def get_word_from_part_obj(self, word_part_obj): """Extracts all 'text' from a list of dictionaries and concatenates it into a string.
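        Example (only the 'text' key matters; other keys are ignored):

            >>> Extractor().get_word_from_part_obj([{'text': 'Nach'}, {'text': 'lass'}])
            'Nachlass'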
""" return ''.join([ dict['text'] for dict in word_part_obj]) def get_word_object_multi_char_x(self, word_part_obj_dict): """Returns the x of the last char of word_part_object. TODO: get real widths from svg_file!!! """ WIDTHFACTOR = 2.6 return word_part_obj_dict['x'] if len(word_part_obj_dict['text']) < 2 else word_part_obj_dict['x'] + len(word_part_obj_dict['text']) * WIDTHFACTOR def update_title_and_manuscript(self, title, update_manuscript=True): """Updates title and manuscript. """ self.title = title if update_manuscript or not bool(self.manuscript_file): self.manuscript_file = self.xml_dir + self.title.replace(' ', '_') + '.xml' if not isfile(self.manuscript_file): self.manuscript_tree = ET.ElementTree(ET.Element('manuscript', attrib={"title": self.title})) write_pretty(xml_element_tree=self.manuscript_tree, file_name=self.manuscript_file, script_name=__file__, file_type='xmlManuscriptFile') def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to extract the position of the words in a svg file and write them to a xml file. svgscripts/extractWordPosition.py [OPTIONS] svg file OR xml target file containing file name of svg file as "/page/@source". directory containing svg files OPTIONS: -h|--help: show help -c|--compare-to-pdf compare words to pdf and autocorrect -d|--xml-dir=xmlDir: target directory for the xml output file(s) -m|--manuscript-file: xml file containing information about the archival order to which the current page(s) belong(s) -p|--page=pageNumber: page number of the current page. For use with _one_ file only. -P|--PDF=pdfFile: pdf file - used for word correction -s|--svg=svgFile: svg web file -t|--title=title: title of the manuscript to which the current page(s) belong(s) -x|--xml-target-file=xmlOutputFile: xml target file :return: exit code (int) """ compare2pdf = True manuscript_file = None page_number = None pdfFile = None svg_file = None title = None xml_target_file = None xml_dir = ".{}xml".format(sep) try: opts, args = getopt.getopt(argv, "hcd:m:t:p:s:x:P:", ["help", "compare-to-pdf", "xml-dir=", "manuscript-file=", "title=", "page=", "svg=", "xml-target-file=", "PDF="]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help') or not args: usage() return 0 elif opt in ('-c', '--compare-to-pdf'): compare2pdf = True elif opt in ('-d', '--xml-dir'): xml_dir = arg elif opt in ('-m', '--manuscript-file'): manuscript_file = arg elif opt in ('-t', '--title'): title = arg elif opt in ('-p', '--page'): page_number = str(arg) elif opt in ('-s', '--svg'): svg_file = arg elif opt in ('-P', '--PDF'): pdfFile = arg elif opt in ('-x', '--xml-target-file'): xml_target_file = str(arg) files_to_process = list() for arg in args: if isfile(arg): files_to_process.append(arg) elif isdir(arg): files_to_process = files_to_process + list(filter(lambda file: '.svg' in file, listdir(arg))) else: print("'{}' does not exist!".format(arg)) return 2 if len(files_to_process) < 1 or args[0].endswith('xml'): if xml_target_file is None: xml_target_file = args[0] if len(args) > 0 else None if xml_target_file is not None and isfile(xml_target_file): target_file_tree = ET.parse(xml_target_file) file_name = target_file_tree.getroot().get('source') title = target_file_tree.getroot().get('title') if title is None else title page_number = target_file_tree.getroot().get('number') if page_number is None else page_number if svg_file is None: if len(target_file_tree.xpath('//svg-image')) > 0: 
svg_file = target_file_tree.xpath('.//svg-image/@file-name')[0]\ if len(target_file_tree.xpath('.//svg-image/@file-name')) > 0 else None else: svg_file = target_file_tree.xpath('.//svg/@file')[0]\ if len(target_file_tree.xpath('.//svg/@file')) > 0 else None files_to_process.insert(0, file_name) if xml_target_file in files_to_process: files_to_process.remove(xml_target_file) else: usage() return 2 if len(files_to_process) > 1 and (bool(page_number) or bool(xml_target_file) or bool(pdfFile) or bool(svg_file)): print("ERROR: too many input files: options --PDF, --page, --svg and --xml-target-file presuppose only one input file!") usage() return 2 extractor = Extractor(xml_dir=xml_dir, title=title, manuscript_file=manuscript_file, compare2pdf=compare2pdf) for file in files_to_process: extractor.extractAndWriteInformation(file, page_number=page_number, xml_target_file=xml_target_file, pdfFile=pdfFile, svg_file=svg_file) return 0 if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/datatypes/clarification.py =================================================================== --- svgscripts/datatypes/clarification.py (revision 110) +++ svgscripts/datatypes/clarification.py (revision 111) @@ -1,51 +1,44 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a word clarification. """ # Copyright (C) University of Basel 2020 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET import re import sys from .editor_comment import EditorComment from .text import Text class Clarification(EditorComment): """ This class represents a word clarification. """ def __init__(self, text=None): super(Clarification, self).__init__(is_uncertain=False) self.text = text - @classmethod - def get_semantic_dictionary(cls): - """ Creates a semantic dictionary as specified by SemanticClass. - """ - dictionary = super(Clarification,cls).get_semantic_dictionary() - dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('text', Text, name='clarificationHasText', cardinality=1)) - return cls.return_dictionary_after_updating_super_classes(dictionary) Index: svgscripts/datatypes/atypical_writing.py =================================================================== --- svgscripts/datatypes/atypical_writing.py (revision 110) +++ svgscripts/datatypes/atypical_writing.py (revision 111) @@ -1,52 +1,43 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent an atypical writing by the author.
""" # Copyright (C) University of Basel 2020 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET import re import sys from .editor_comment import EditorComment from .text import Text class AtypicalWriting(EditorComment): """ This class represents a a atpycial writing by the author. """ def __init__(self, text=None): super(AtypicalWriting, self).__init__(is_uncertain=False) self.text = text - - @classmethod - def get_semantic_dictionary(cls): - """ Creates a semantic dictionary as specified by SemanticClass. - """ - dictionary = super(AtypicalWriting,cls).get_semantic_dictionary() - dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('text', Text, name='atypicalWritingHasText')) - return cls.return_dictionary_after_updating_super_classes(dictionary) - Index: svgscripts/datatypes/word.py =================================================================== --- svgscripts/datatypes/word.py (revision 110) +++ svgscripts/datatypes/word.py (revision 111) @@ -1,913 +1,919 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a word. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import copy import inspect from lxml import etree as ET from operator import attrgetter import re import string import sys import warnings from .box import Box from .editor_comment import EditorComment from .matrix import Matrix from .path import Path from .simple_word import SimpleWord from .style import Style from .word_deletion_path import WordDeletionPath from .word_position import WordPosition from .transkription_position import TranskriptionPosition from .writing_process import WritingProcess SINGLE_PUNCTUATION_PATTERN = r"^[{}–]$".format(string.punctuation) def execute_function_on_parts(word_parts, func_name): """Execute function on parts and add those parts instead of original word to word_parts. 
:return: new word_parts, output from func """ copy_parts = word_parts[:] for word in word_parts: output = eval('word.{0}()'.format(func_name)) if len(word.word_parts) > 0: for part_word in word.word_parts: copy_parts.insert(copy_parts.index(word), part_word) copy_parts.remove(word) word.word_parts = [] return copy_parts, output def update_transkription_position_ids(word): """Update transkription_position' ids according to index. """ word_part_ids = [ wp.id for wp in word.word_parts ] if len(word_part_ids) != len(set(word_part_ids)): for id, wp in enumerate(word.word_parts): wp.id = id for index, transkription_position in enumerate(sorted(word.transkription_positions, key=attrgetter('left'))): transkription_position.id = index transkription_position.has_box = None transkription_position.deleted = False class Word(SimpleWord): """ This class represents a word. """ COPY_PROPERTY_KEY = [ 'line_number', 'deleted', 'writing_process_id' ] APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS = { 'style': 'styles' } DATA = 'debug-data' RDFS_SUBCLASSOF_LIST = ['https://www.e-editiones.ch/ontology/text#HandwrittenText'] XML_TAG = 'word' XML_EARLIER_VERSION = 'earlier-version' XML_OVERWRITES = 'overwrites' XML_CORRECTION_DICT = { 'isClarificationOfWord': 'clarifiesWord',\ 'isDeletionOfWord': 'deletesEarlierPart',\ 'isExtensionOfWord': 'extendsEarlierVersion',\ 'isTransformationOfWord': 'transformsEarlierPart' } def __init__(self, id=0, text='', line_number=-1, deleted=False, transkription_positions=None, faksimile_positions=None, word_part_objs=None, word_parts=None, writing_process_id=-1, earlier_version=None, box_paths=None, styles=None): super(Word,self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) self.corrections = [] self.clean_edited_text = None self.deleted = deleted self.deletion_paths = [] self.deletion_paths_near_word = [] self.debug_container = {} self.debug_msg = None self.earlier_version = earlier_version self.edited_text = None - self.editor_comment = None + #self.editor_comment = None + self.editor_comments = [] self.isClarificationOfWord = None self.isDeletionOfWord = None self.isExtensionOfWord = None self.isTransformationOfWord = None if len(self.text) == 0 and len(''.join([ tp.get_text() for tp in self.transkription_positions if type(tp) == TranskriptionPosition ])) > 0: self.text = ''.join([ tp.get_text() for tp in self.transkription_positions ]) self.overwrites_word = None self.process_flags = [] self.styles = styles\ if styles is not None\ else [] self.verified = None self.writing_process_id = writing_process_id self.writing_processes = [] self.word_insertion_mark = None self.word_box = None self.word_parts = word_parts if word_parts is not None else [] self.word_part_objs = word_part_objs if word_part_objs is not None else [] def add_deletion_paths(self, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Add a word deletion path to word. 
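        A hedged usage sketch (deletion_paths as extracted elsewhere from the
        page's SVG file; the offsets are the transkription field's origin and
        are hypothetical values here):

            word.add_deletion_paths(deletion_paths, tr_xmin=37.0, tr_ymin=24.0)
            # for a word with word.deleted == True, word.deletion_paths now holds
            # the paths that intersect the word's transkription positions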
""" if len(self.word_parts) > 0: for part in self.word_parts: part.add_deletion_paths(deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) elif self.deleted: index = 0 while len(self.deletion_paths) == 0 and index < len(self.transkription_positions): include_pwps = (len(self.transkription_positions[index].positional_word_parts) > 0 and abs(self.transkription_positions[index].left-self.transkription_positions[index].positional_word_parts[0].left) < 10) word_path = Path.create_path_from_transkription_position(self.transkription_positions[index],\ tr_xmin=tr_xmin, tr_ymin=tr_ymin, include_pwps=include_pwps) self.deletion_paths += [ deletion_path for deletion_path in deletion_paths\ if not Path.is_path_contained(self.deletion_paths, deletion_path)\ and deletion_path.do_paths_intersect(word_path) ] index += 1 def attach_word_to_tree(self, target_tree): """Attaches word to tree target_tree. """ word_node = super(Word,self).attach_word_to_tree(target_tree) if self.deleted is not None: word_node.set('deleted', str(self.deleted).lower()) if self.verified is not None: word_node.set('verified', str(self.verified).lower()) if self.edited_text is not None: word_node.set('edited-text', self.edited_text) - if self.editor_comment is not None: - self.editor_comment.attach_object_to_tree(word_node) + #if self.editor_comment is not None: + # self.editor_comment.attach_object_to_tree(word_node) + for editor_comment in self.editor_comments: + editor_comment.attach_object_to_tree(word_node) if self.writing_process_id > -1: word_node.set('writing-process-id', str(self.writing_process_id)) if len(self.process_flags) > 0: word_node.set('process-flags', ' '.join(self.process_flags)) for index, word_part in enumerate(self.word_parts): word_part.id = index word_part.attach_word_to_tree(word_node) if self.earlier_version is not None: earlier_node = ET.SubElement(word_node, self.XML_EARLIER_VERSION) self.earlier_version.attach_word_to_tree(earlier_node) if self.overwrites_word is not None\ and len(self.overwrites_word.transkription_positions) > 0: overwrite_node = ET.SubElement(word_node, self.XML_OVERWRITES) self.overwrites_word.attach_word_to_tree(overwrite_node) if self.word_box is not None: self.word_box.attach_object_to_tree(word_node) if len(self.corrections) > 0: word_node.set('corrections', ' '.join(set([ str(word.id) for word in self.corrections ]))) for deletion_id, deletion_path in enumerate(self.deletion_paths): deletion_path.id = deletion_id deletion_path.tag = WordDeletionPath.XML_TAG deletion_path.attach_object_to_tree(word_node) for key in self.XML_CORRECTION_DICT.keys(): if self.__dict__[key] is not None: word_node.set(self.XML_CORRECTION_DICT[key], 'true') return word_node def belongs_to_multiple_writing_processes(self, include_parts=False): """Returns true if transkription_positions belong to different WritingProcesses. """ if len(self.word_parts) > 0 and include_parts: return len(set(word.writing_process_id for word in self.word_parts)) > 1 return len(set(tp.writing_process_id for tp in self.transkription_positions )) > 1 def set_parent_word_writing_process_id(self): """Set writing_process_id for parent word. 
""" ids = set(word.transkription_positions[0].style for word in self.word_parts\ if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None) if len(ids) > 1: self.writing_process_id = max([style.writing_process_id for style in ids]) if len(set(word.transkription_positions[0].style.create_a_copy_wo_writing_process_id()\ for word in self.word_parts\ if len(word.transkription_positions) > 0 and word.transkription_positions[0].style is not None))\ > 1: self.writing_process_id += 1 @classmethod def create_cls(cls, word_node): """Creates a word from a (lxml.Element) node. [:return:] Word """ cls = super(Word,cls).create_cls(word_node) cls.writing_process_id = int(word_node.get('writing-process-id')) if bool(word_node.get('writing-process-id')) else -1 cls.split_strings = None cls.join_string = word_node.get('join') if bool(word_node.get('split')): cls.split_strings = word_node.get('split').split(' ') if ''.join(cls.split_strings) != cls.text: error_msg = 'Error in file {0}: word with id="{1}" has split attributes that do not correspond to its text attribute!\n'.\ format(word_node.getroottree().docinfo.URL, str(cls.id))\ + 'Split attributes: "{0}".\n'.format(' '.join(cls.split_strings))\ + 'Text attribute: "{0}".\n'.format(cls.text) raise Exception(error_msg) cls.verified = word_node.get('verified') == 'true'\ if bool(word_node.get('verified')) else None cls.deleted = word_node.get('deleted') == 'true'\ if bool(word_node.get('deleted')) else None cls.edited_text = word_node.get('edited-text') if cls.edited_text is not None: cls.clean_edited_text = cls._create_clean_text(cls.edited_text) + cls.editor_comments = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ] + """ cls.editor_comment = [ EditorComment.create_cls_from_node(node) for node in word_node.xpath('./' + EditorComment.XML_TAG) ][0]\ if len([ node for node in word_node.xpath('./' + EditorComment.XML_TAG) ]) > 0 else None + """ cls.word_parts = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_TAG) ] if bool(word_node.get('corrections')): for index in [ int(i) for i in word_node.get('corrections').split(' ') ]: if index < len(cls.word_parts): cls.corrections.append(cls.word_parts[index]) cls.earlier_version = None if len(word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG)) > 0: cls.earlier_version = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_EARLIER_VERSION + '/' + cls.XML_TAG) ][0] for key_value in cls.XML_CORRECTION_DICT.values(): if word_node.get(key_value) == 'true': cls.__dict__[key_value] = True if cls.earlier_version is not None: for word_part in cls.word_parts: for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Part') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]\ and len(cls.word_parts) <= len(cls.earlier_version.word_parts): try: word_part.__dict__[key] = cls.earlier_version.word_parts[word_part.id] except Exception: msg = f'{cls.id} {cls.text}: {word_part.id}' raise Exception(msg) for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('EarlierVersion') ]: if cls.XML_CORRECTION_DICT[key] in word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls.earlier_version for key in [ key for key, value in cls.XML_CORRECTION_DICT.items() if value.endswith('Word') ]: if cls.XML_CORRECTION_DICT[key] in 
word_part.__dict__.keys() and word_part.__dict__[cls.XML_CORRECTION_DICT[key]]: word_part.__dict__[key] = cls cls.overwrites_word = [ cls.create_cls(node) for node in word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)][0]\ if len(word_node.xpath('./' + cls.XML_OVERWRITES + '/' + cls.XML_TAG)) > 0\ else None cls.word_box = [ Box(node=node) for node in word_node.xpath('./' + Box.XML_TAG) ][0]\ if len(word_node.xpath('./' + Box.XML_TAG)) > 0\ else None cls.deletion_paths = [ Path(node=node) for node in word_node.xpath(f'./{WordDeletionPath.XML_TAG}') ] cls.process_flags = word_node.get('process-flags').split(' ')\ if bool(word_node.get('process-flags'))\ else [] return cls @classmethod def join_words(cls, list_of_words, add_white_space_between_words=False): """Creates a word from a list of words. [:return:] Word """ if len(list_of_words) > 1: deleted = True in [ word.deleted for word in list_of_words ]\ and len(set([ word.deleted for word in list_of_words ])) == 1 line_number = list_of_words[0].line_number\ if len(set([ word.line_number for word in list_of_words ])) == 1\ else -1 faksimile_positions = [] for word in list_of_words: if len(word.word_parts) > 0: faksimile_positions += word.faksimile_positions index = list_of_words.index(word) list_of_words.remove(word) for part_word in reversed(word.word_parts): list_of_words.insert(index, part_word) new_word_text = ''.join([word.text for word in list_of_words])\ if not add_white_space_between_words\ else ' '.join([word.text for word in list_of_words]) new_word = cls(id=list_of_words[0].id, text=new_word_text, faksimile_positions=faksimile_positions,\ line_number=line_number, deleted=deleted, word_parts=list_of_words) if True in [ word.text.endswith('-') or word.text.endswith('=') for word in new_word.word_parts[:-1]]: change_text = [ word.text for word in new_word.word_parts[:-1] if word.text.endswith('-') or word.text.endswith('=') ][0] new_word.edited_text = new_word.text.replace(change_text, change_text[:-1]) for id, word in enumerate(new_word.word_parts): word.id = id return new_word if len(list_of_words) > 0: return list_of_words[0] else: return None def create_earlier_version(self, root_word=None, id=0): """Create an earlier version of word. 
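        A hedged usage sketch (normally driven by create_correction_history):

            earlier = word.create_earlier_version()
            # earlier is a new Word approximating the state before deletions and
            # overwrites; corrected parts are collected in word.corrections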
""" if root_word is None: root_word = self root_word.set_parent_word_writing_process_id() word_parts = [] non_single_punctuation_word_parts = [ word_part for word_part in self.word_parts\ if not re.match(SINGLE_PUNCTUATION_PATTERN, word_part.text) ] non_single_punctuation_word_parts_length = len(non_single_punctuation_word_parts) if non_single_punctuation_word_parts_length > 0\ and len([ word_part for word_part in non_single_punctuation_word_parts\ if word_part.deleted ])\ == non_single_punctuation_word_parts_length: self.deleted = True for word_part in non_single_punctuation_word_parts: word_part.deleted = False for id, word_part in enumerate(self.word_parts): earlierWordPart = word_part.create_earlier_version(root_word=root_word, id=id) if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart word_parts.append(earlierWordPart) if word_part not in self.corrections: self.corrections.append(word_part) elif word_part.overwrites_word is not None\ and ((len(word_part.transkription_positions) > 0\ and word_part.overwrites_word.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style\ != word_part.overwrites_word.transkription_positions[0].style) or word_part.word_box.earlier_version): word_part.overwrites_word.id = word_part.id word_parts.append(word_part.overwrites_word) word_part.isTransformationOfWord = word_part.overwrites_word #print(f'transform: {self.text}') if word_part not in self.corrections: self.corrections.append(word_part) elif root_word.writing_process_id > -1\ and (len(word_part.transkription_positions) > 0\ and word_part.transkription_positions[0].style is not None\ and word_part.transkription_positions[0].style.writing_process_id\ == root_word.writing_process_id): word_part.extendsEarlierVersion = True #print('extends') if word_part not in self.corrections: self.corrections.append(word_part) else: if word_part.deleted: word_part.isDeletionOfWord = earlierWordPart word_parts.append(earlierWordPart) if word_part not in self.corrections: self.corrections.append(word_part) else: #print(f'default: {self.text}') word_parts.append(earlierWordPart) text = ''.join([ word.text for word in word_parts ])\ if len(word_parts) > 0\ else self.text if len(word_parts) == 1: self.transkription_positions += word_parts[0].transkription_positions self.faksimile_positions += word_parts[0].faksimile_positions word_parts = [] new_transkription_positions = copy.deepcopy(self.transkription_positions) if len(self.transkription_positions) > 0\ and self.transkription_positions[0].style is not None: writing_process_id = self.transkription_positions[0].style.writing_process_id for new_tp in new_transkription_positions: new_tp.style.writing_process_id = writing_process_id return Word(id=id, text=text, transkription_positions=new_transkription_positions,\ faksimile_positions=self.faksimile_positions, line_number=self.line_number,\ word_parts=word_parts) def create_correction_history(self, page=None, box_style=None): """Create correction history. 
""" if self.word_box is not None: manuscript = self.transkription_positions[0].style.manuscript\ if len(self.transkription_positions) > 0\ and self.transkription_positions[0].style is not None\ else None style = Style() if box_style is not None: style = box_style if page is not None: style = Style.create_cls(page, self.word_box.text_style_class, manuscript=manuscript) for font_key in [ font_key for font_key in self.word_box.text_style_class.split(' ') if font_key in page.fontsizekey2stage_mapping.keys() ]: style.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) transkription_positions = TranskriptionPosition.copy_list_of_cls(self.transkription_positions) for transkription_position in transkription_positions: transkription_position.style = style self.overwrites_word = Word(text=self.word_box.earlier_text, transkription_positions=transkription_positions,\ line_number=self.line_number) for word_part in self.word_parts: word_part.create_correction_history(page=page, box_style=box_style) if len(self.word_parts) > 0: earlier_version = self.create_earlier_version() extending_words = self._get_parts_with_property_key('extendsEarlierVersion') if len(extending_words) > 0: for word in extending_words: word.isExtensionOfWord = earlier_version if self.has_mixed_status('deleted', include_parts=True): self.edited_text = ''.join([ word.text for word in self.word_parts if not word.deleted ]) if len(self.corrections) > 0: self.earlier_version = earlier_version @staticmethod def CREATE_WORD(word_node=None, page=None, word_part_objs=[], id=0, height=0, endX=0, endSign=None, matrix=None, line_number=-1, debug_msg=None): """Creates a word from a (lxml.Element) node or word_part_objs. [:return:] Word """ if word_node is not None: # init word from xml node id = int(word_node.get('id')) line_number = int(word_node.get('line-number')) if bool(word_node.get('line-number')) else line_number text = word_node.get('text') deleted = bool(word_node.get('deleted')) and word_node.get('deleted') == 'true' transkription_positions = [ TranskriptionPosition(node=node) for node in word_node.findall('.//' + WordPosition.TRANSKRIPTION) ] faksimile_positions = [ WordPosition(node=node) for node in word_node.findall('.//' + WordPosition.FAKSIMILE) ] word_part_objs = [ item.attrib for item in word_node.findall('.//' + Word.DATA + '/part')]\ if len(word_node.findall('.//' + Word.DATA)) > 0\ else [ item.attrib for item in word_node.findall('.//part')] return Word(id=id, text=text, deleted=deleted, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions, word_part_objs=word_part_objs) elif len(word_part_objs) > 0: # init word from word_part_obj that has been extracted from svg file WIDTH = 5 TOPCORRECTION = 2.0 FONTWIDTHFACTOR = 0.7 # factor that multiplies lastCharFontSize height = height x = round(float(word_part_objs[0]['x']), 3) if(page is not None and bool(page.style_dict)): HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height style_set = set(' '.join(set( dict['class'] for dict in word_part_objs)).split(' ')) biggest_font_size = page.get_biggest_fontSize4styles(style_set=style_set) height = round(biggest_font_size * HEIGHT_FACTOR + HEIGHT_FACTOR / biggest_font_size, 3) TOPCORRECTION = 1 + HEIGHT_FACTOR / biggest_font_size if endSign is not None and '%' in endSign: lastCharFontSizeList = [ float(page.style_dict[key]['font-size'].replace('px',''))\ for key in word_part_objs[len(word_part_objs)-1]['class'].split(' ')\ if 
bool(page.style_dict[key].get('font-size'))] lastCharFontSize = lastCharFontSizeList[0] if len(lastCharFontSizeList) > 0 else 1 endX = float(endX) + lastCharFontSize * FONTWIDTHFACTOR elif endSign is not None and '%' in endSign: endX = float(endX) + WIDTH bottom = round(float(word_part_objs[0]['y']), 3) y = round(bottom - height + TOPCORRECTION, 3) width = round(float(endX) - x, 3) transkription_positions = [ WordPosition(height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) ] text = ''.join([ dict['text'] for dict in word_part_objs]) line_number = page.get_line_number( (y + bottom)/2) if page is not None else line_number word = Word(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions, word_part_objs=word_part_objs) word.debug_msg = debug_msg return word else: error_msg = 'word_node has not been defined' if (word_node is None) else 'word_part_objs is empty' raise Exception('Error: {}'.format(error_msg)) @classmethod def get_semantic_dictionary(cls): """ Creates and returns a semantic dictionary as specified by SemanticClass. """ dictionary = super(Word,cls).get_semantic_dictionary() dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('styles', Style,\ cardinality=1, cardinality_restriction='minCardinality',\ name='wordHasStyle', label='word has style', comment='Word has an appearance that is characterized by this style.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('corrections', Word,\ name='wordHasCorrection', label='word has corrections', comment='Word has a correction made by the author.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('deletion_paths', WordDeletionPath,\ name='wordIsDeletedByPath', label='word has been deleted with a deletion path',\ comment='Word has been deleted by the author using a deletion path.')) - dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('editor_comment', EditorComment,\ + dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('editor_comments', EditorComment,\ name='wordHasEditorComment', label='word has a comment by the editors', comment='Word has been commented by the editors.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('earlier_version', Word,\ name='wordHasEarlierVersion', label='word has an earlier version', comment='There is an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('edited_text', str,\ name='hasEditedText', label='word has an edited text', comment='Word has a text that is edited automatically by removing deleted parts or hyphens.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('clean_edited_text', str,\ name='hasCleanEditedText', label='word has an edited text without punctuation',\ comment='Word has a text without punctuation that is edited automatically by removing deleted parts or hyphens.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isClarificationOfWord', Word,\ name='isClarificationOfWord', label='word is a clarification of word',\ comment='The author has used this part of the word in order to clarify the appearance of that word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isDeletionOfWord', Word,\ name='isDeletionOfWord', label='word is a deletion of word',\ comment='The author has used this part of a word in order 
to delete the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isExtensionOfWord', Word,\ name='isExtensionOfWord', label='word is an extension of word',\ comment='The author has used this part of a word in order to extend an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('isTransformationOfWord', Word,\ name='isTransformationOfWord', label='word is a transformation of word',\ comment='The author has used this part of a word in order to transform the corresponding part of an earlier version of this word.')) dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('overwrites_word', Word,\ name='overwritesWord', label='word overwrites word',\ comment='The author has used this word in order to overwrite that word.')) # This makes wordHasWordParts a subproperty of cls.HAS_HOMOTYPIC_PARTS_URL_STRING, # cls.return_dictionary_after_updating_super_classes will subclass Word under the corresponding super class. dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('word_parts', list,\ name='wordHasWordParts', label='word has word parts', comment='Word consists of a list of words.',\ subPropertyOf=cls.HAS_HOMOTYPIC_PARTS_URL_STRING)) super_property_dictionary = cls.create_semantic_property_dictionary(cls.SUPER_PROPERTY, Word,\ name='isCorrectionOfWord', label='word is a correction of word',\ comment='The author has used this word in order to correct that word.') for key in cls.XML_CORRECTION_DICT.keys(): correction_dict = dictionary[cls.PROPERTIES_KEY].get(key) correction_dict.update(super_property_dictionary) dictionary[cls.PROPERTIES_KEY].update({key: correction_dict}) return cls.return_dictionary_after_updating_super_classes(dictionary) def has_mixed_status(self, property_key, include_parts=False, concerns_word=True): """Returns true if transkription_positions have mixed status concerning the property_key in their __dict__. """ if False in set(property_key in tp.__dict__.keys() for tp in self.transkription_positions): return False if len(self.word_parts) > 0 and include_parts: if concerns_word: if False in set(property_key in word.__dict__.keys() for word in self.word_parts): return False return len(set(word.__dict__[property_key] for word in self.word_parts)) > 1 else: return len(set(word.transkription_positions[0].__dict__[property_key] for word in self.word_parts\ if len(word.transkription_positions) > 0 and property_key in word.transkription_positions[0].__dict__.keys())) > 1 return len(set(tp.__dict__[property_key] for tp in self.transkription_positions )) > 1 def init_word(self, page): """Initialize word with objects from page. 
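        Usage sketch (assumes a fully instantiated datatypes.page.Page):
            word.init_word(page) # resolves lines, writing processes and deletion paths via page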
""" super(Word,self).init_word(page) if self.writing_process_id > -1: self.writing_processes += [ wp for wp in page.writing_processes if wp.id == self.writing_process_id ] writing_processes = self.writing_processes for word_part in self.word_parts: word_part.init_word(page) self.lines += word_part.lines self.writing_processes += word_part.writing_processes self.lines = [ line for line in set(self.lines) ] self.writing_processes = [ wp for wp in set(self.writing_processes)] if self.overwrites_word is not None: self.overwrites_word.init_word(page) if self.earlier_version is not None: if self.earlier_version.writing_process_id == -1: self.earlier_version.writing_process_id = self.writing_process_id-1 if self.earlier_version.line_number == -1: self.earlier_version.line_number = self.line_number self.earlier_version.init_word(page) self.deletion_paths = [ page.get_word_deletion_path(path) for path in self.deletion_paths if path.path is not None ] def join(self, other_word, append_at_end_of_new_word=True, add_white_space_between_words=False): """Joins other_word with this word by changing the text of current word and adding other_word.transkription_positions. """ if append_at_end_of_new_word: self.text = self.text + other_word.text\ if not add_white_space_between_words\ else self.text + ' ' + other_word.text for position in other_word.transkription_positions: position.id = str(len(self.transkription_positions)) self.transkription_positions.append(position) for position in other_word.faksimile_positions: position.id = str(len(self.faksimile_positions)) self.faksimile_positions.append(position) else: self.text = other_word.text + self.text index = 0 for position in other_word.transkription_positions: self.transkription_positions.insert(index, position) index += 1 while index < len(self.transkription_positions): self.transkription_positions[index].id = str(index) index += 1 index = 0 for position in other_word.faksimile_positions: self.faksimile_positions.insert(indexposition) index += 1 while index < len(self.faksimile_positions): self.faksimile_positions[index].id = str(index) index += 1 self.simplify_transkription_positions() def partition_according_to_deletion(self): """Partition a word according to its transkription_positions' deletion status ->split word and add partial words as its parts. 
""" if self.has_mixed_status('deleted'): transkription_positions = [] last_status = None for transkription_position in self.transkription_positions: if transkription_position.deleted != last_status\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) for tp in transkription_positions: newWord.deletion_paths += tp._deletion_paths self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.deleted if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=last_status, writing_process_id=self.writing_process_id) for tp in transkription_positions: newWord.deletion_paths += tp._deletion_paths self.word_parts.append(newWord) self.transkription_positions = [] self.line_number = -1 self.deleted = False elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_deletion') elif not self.deleted\ and len(self.transkription_positions) > 0\ and self.transkription_positions[0].deleted: self.deleted = True for tp in self.transkription_positions: self.deletion_paths += tp._deletion_paths def partition_according_to_writing_process_id(self): """Partition a word according to its transkription_positions' writing_process_ids ->split word and add partial words as its parts. """ if self.belongs_to_multiple_writing_processes(): last_writing_process_id = -1 transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.writing_process_id != last_writing_process_id\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) transkription_positions = [] transkription_positions.append(transkription_position) last_writing_process_id = transkription_position.writing_process_id if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, writing_process_id=last_writing_process_id) self.word_parts.append(newWord) self.transkription_positions = [] elif len(self.word_parts) > 0: self.word_parts, none = execute_function_on_parts(self.word_parts, 'partition_according_to_writing_process_id') if self.belongs_to_multiple_writing_processes(include_parts=True): self.writing_process_id = sorted(set([ word.writing_process_id for word in self.word_parts ]), reverse=True)[0] elif len(self.transkription_positions) > 0: self.writing_process_id = self.transkription_positions[0].writing_process_id def process_boxes(self, box_paths, tr_xmin=0.0, tr_ymin=0.0, previous_word_has_box=False): """Determines whether word is over a word box. 
""" word_over_box = None if len(self.word_parts) > 0: for word in self.word_parts: current_word = word.process_boxes(box_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin, previous_word_has_box=(word_over_box is not None)) if current_word is not None and current_word.word_box is not None: word_over_box = current_word else: new_tp_dict = {} for index, transkription_position in enumerate(self.transkription_positions): if previous_word_has_box and index == 0: if len(transkription_position.positional_word_parts) > 0: transkription_position.positional_word_parts[0].left += transkription_position.positional_word_parts[0].width/2 #print(f'{self.text}: {transkription_position.positional_word_parts[0].left}') else: transkription_position.left += 1 word_path = Path.create_path_from_transkription_position(transkription_position,\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) containing_boxes = [ box_path for box_path in box_paths\ if word_path.is_partially_contained_by(box_path)\ or box_path.do_paths_intersect(word_path) ] if len(containing_boxes) > 0: if previous_word_has_box: print(f'{self.text}: {word_path.path.bbox()} {containing_boxes[0].path.bbox()}') self._set_box_to_transkription_position(containing_boxes[0], word_path,\ transkription_position, new_tp_dict, tr_xmin) box_paths.remove(containing_boxes[0]) for replace_tp in new_tp_dict.keys(): for tp in new_tp_dict.get(replace_tp): self.transkription_positions.insert(self.transkription_positions.index(replace_tp), tp) self.transkription_positions.remove(replace_tp) word_over_box = self._get_partial_word_over_box() update_transkription_position_ids(self) return word_over_box def set_word_insertion_mark(self, word_insertion_mark): """Sets word_insertion_mark """ self.word_insertion_mark = word_insertion_mark def set_writing_process_id_to_transkription_positions(self, page): """Determines the writing process id of the transkription_positions. """ for transkription_position in self.transkription_positions: if len(transkription_position.positional_word_parts) > 0: for font_key in transkription_position.positional_word_parts[0].style_class.split(' '): if font_key in page.fontsizekey2stage_mapping.keys(): transkription_position.writing_process_id = page.fontsizekey2stage_mapping.get(font_key) def simplify_transkription_positions(self): """Merge transkription_positions if possible. """ index = len(self.transkription_positions)-1 while index > 0\ and False not in [ 'positional_word_parts' in tp.__dict__.keys() for tp in self.transkription_positions ]: current_tp = self.transkription_positions[index] index -= 1 previous_tp = self.transkription_positions[index] if previous_tp.is_mergebale_with(current_tp): positional_word_parts = previous_tp.positional_word_parts positional_word_parts += current_tp.positional_word_parts transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ positional_word_parts, debug_msg_string='simplifying transkription positions', transkription_position_id=previous_tp.id) if len(transkription_positions) == 1: transkription_positions[0].writing_process_id = previous_tp.writing_process_id\ if previous_tp.writing_process_id != -1\ else current_tp.writing_process_id self.transkription_positions.pop(index+1) self.transkription_positions[index] = transkription_positions[0] #print(self.text, len(self.transkription_positions)) def split(self, split_string, start_id=0): """Splits the word and returns an 3-tuple of new words. 
""" previousString, currentString, nextString = self.text.partition(split_string) currentWord = None previousWord = None nextWord = None previousIndex = 0 current_id = start_id all_positional_word_parts = [] for position in self.transkription_positions: all_positional_word_parts += position.positional_word_parts if len(all_positional_word_parts) == 0: warnings.warn('ATTENTION: Word: {} {} with Strings "{}, {}, {}": there are no parts!'.format(self.id, self.text, previousString, currentString, nextString)) if len(previousString) > 0: previous_pwps = [] while previousIndex < len(all_positional_word_parts) and previousString != ''.join([ pwp.text for pwp in previous_pwps ]): previous_pwps.append(all_positional_word_parts[previousIndex]) previousIndex += 1 if previousString != ''.join([ pwp.text for pwp in previous_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(previousString)) else: previous_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(previous_pwps, debug_msg_string='word.split') previous_text = ''.join([ pwp.text for pwp in previous_pwps ]) previousWord = Word(text=previous_text, id=current_id, line_number=self.line_number, transkription_positions=previous_transkription_positions) previousWord.faksimile_positions = self.faksimile_positions current_id += 1 all_positional_word_parts = all_positional_word_parts[previousIndex:] if len(nextString) > 0: tmp_pwps = [] index = 0 while index < len(all_positional_word_parts) and currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): tmp_pwps.append(all_positional_word_parts[index]) index += 1 if currentString != ''.join([ pwp.text for pwp in tmp_pwps ]): warnings.warn('ATTENTION: "{}" does not match a word_part_obj!'.format(currentString)) else: next_pwps = all_positional_word_parts[index:] next_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(next_pwps, debug_msg_string='word.split') next_text = ''.join([ pwp.text for pwp in next_pwps ]) nextWord = Word(text=next_text, id=current_id+1, line_number=self.line_number, transkription_positions=next_transkription_positions) nextWord.faksimile_positions = self.faksimile_positions all_positional_word_parts = all_positional_word_parts[:index] current_transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(all_positional_word_parts, debug_msg_string='word.split') current_text = ''.join([ pwp.text for pwp in all_positional_word_parts ]) currentWord = Word(text=current_text, id=current_id, line_number=self.line_number, transkription_positions=current_transkription_positions) currentWord.faksimile_positions = self.faksimile_positions return previousWord, currentWord, nextWord def split_according_to_status(self, status, splits_are_parts=False): """Split a word according to its transkription_positions' text. 
:return: a list of new word.Word """ new_words = [] if self.has_mixed_status(status): last_status = None transkription_positions = [] for transkription_position in self.transkription_positions: if transkription_position.__dict__[status] != last_status\ and len(transkription_positions) > 0: new_words.append(\ self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words))) transkription_positions = [] transkription_positions.append(transkription_position) last_status = transkription_position.__dict__[status] if len(transkription_positions) > 0: new_words.append(\ self._create_new_word(transkription_positions, status, new_id=self.id+len(new_words))) if splits_are_parts: self.word_parts += new_words if len(self.word_parts) > 0: self.transkription_positions = [] return new_words def undo_partitioning(self): """Undo partitioning. """ if len(self.word_parts) > 0: for word_part in self.word_parts: word_part.undo_partitioning() if self.text != ''.join([ tp.get_text() for tp in self.transkription_positions ]): self.transkription_positions += word_part.transkription_positions self.earlier_version = None self.edited_text = None self.word_box = None self.word_parts = [] self.corrections = [] self.earlier_versions = [] self.box_paths = [] def _create_new_word(self, transkription_positions, status, new_id=0): """Create a new word from self and transkription_positions. """ newWord = Word(id=new_id, transkription_positions=transkription_positions) for key in self.COPY_PROPERTY_KEY: if key != status and key in self.__dict__.keys(): newWord.__dict__[key] = self.__dict__[key] if status in self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS.keys(): newWord.__dict__[self.APPEND_PROPERTY2LIST_SOURCE_TARGET_KEYS[status]].append(transkription_positions[0].__dict__[status]) else: newWord.__dict__[status] = transkription_positions[0].__dict__[status] return newWord def _get_parts_with_property_key(self, property_key): """Return a list of word_parts with property == property_key. """ word_parts = [] for word_part in self.word_parts: if property_key in word_part.__dict__.keys(): word_parts.append(word_part) else: word_parts += word_part._get_parts_with_property_key(property_key) return word_parts def _get_partial_word_over_box(self): """Partition a word according to its transkription_positions' has_box ->split word and add partial words as its parts. 
:return: word over box or self """ word_over_box = None if self.has_mixed_status('has_box'): transkription_positions = [] last_word_box = None for transkription_position in self.transkription_positions: if transkription_position.has_box != last_word_box\ and len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) if last_word_box is not None: word_over_box = newWord word_over_box.word_box = last_word_box transkription_positions = [] transkription_positions.append(transkription_position) last_word_box = transkription_position.has_box if len(transkription_positions) > 0: newWord = Word(id=len(self.word_parts), line_number=self.line_number,\ transkription_positions=transkription_positions, deleted=self.deleted, writing_process_id=self.writing_process_id) self.word_parts.append(newWord) if last_word_box is not None: word_over_box = newWord word_over_box.word_box = last_word_box self.transkription_positions = [] elif len(self.word_parts) > 0: #self.word_parts, word_over_box = execute_function_on_parts(self.word_parts, inspect.currentframe().f_code.co_name) #'get_partial_word_over_box') for word_part in self.word_parts: if word_over_box is None: word_over_box = word_part._get_partial_word_over_box() else: break elif len([ tp for tp in self.transkription_positions if tp.has_box is not None]) == 1: word_over_box = self word_over_box.word_box = [ tp for tp in self.transkription_positions if tp.has_box is not None][0].has_box return word_over_box def _set_box_to_transkription_position(self, box_path, word_path, transkription_position, new_transkription_positions_dictionary, tr_xmin): """Set box_path to transkription_position that is contained by box_path. Create new transkription_positions by splitting old ones if necessary and add them to new_transkription_positions_dictionary. """ if box_path.contains_path(word_path): transkription_position.has_box = box_path elif box_path.contains_start_of_path(word_path): split_position = box_path.path.bbox()[1] - tr_xmin new_tps = transkription_position.split(split_position) if len(new_tps) == 2: new_tps[0].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path elif box_path.contains_end_of_path(word_path): split_position = box_path.path.bbox()[0] - tr_xmin new_tps = transkription_position.split(split_position) if len(new_tps) == 2: new_tps[1].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path else: # box_path in the middle of word_path split_position1 = box_path.path.bbox()[0] - tr_xmin split_position2 = box_path.path.bbox()[1] - tr_xmin new_tps = transkription_position.split(split_position1, split_position2) if len(new_tps) >= 2: new_tps[1].has_box = box_path new_transkription_positions_dictionary.update({ transkription_position: new_tps }) else: transkription_position.has_box = box_path def do_paths_intersect_saveMode(mypath1, mypath2): """Returns true if paths intersect, false if not or if there was an exception. 
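        Usage sketch (both arguments are datatypes.path.Path-like objects wrapping svgpathtools paths):
            intersects = do_paths_intersect_saveMode(word_path, deletion_path)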
""" try: return mypath1.path.intersect(mypath2.path, justonemode=True)\ or mypath1.is_partially_contained_by(mypath2) except AssertionError: return False Index: svgscripts/datatypes/page.py =================================================================== --- svgscripts/datatypes/page.py (revision 110) +++ svgscripts/datatypes/page.py (revision 111) @@ -1,430 +1,432 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile, basename from progress.bar import Bar from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.parser import parse_path import re import sys import warnings from .box import Box from .color import Color from .image import Image, SVGImage from .faksimile_image import FaksimileImage from .faksimile_position import FaksimilePosition from .imprint import Imprint from .lineNumber import LineNumber from .line import Line from .mark_foreign_hands import MarkForeignHands from .matrix import Matrix from .path import Path from .positional_word_part import PositionalWordPart from .super_page import SuperPage from .style import Style from .text_connection_mark import TextConnectionMark from .text_field import TextField from .transkriptionField import TranskriptionField from .writing_process import WritingProcess from .word import Word from .word_deletion_path import WordDeletionPath from .word_insertion_mark import WordInsertionMark sys.path.append('py2ttl') from class_spec import SemanticClass sys.path.append('shared_util') from main_util import extract_paths_on_tf, get_paths_near_position FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK class Page(SemanticClass,SuperPage): """ This class represents a page. Args: xml_source_file (str): name of the xml file to be instantiated. faksimile_image: FaksimileImage. faksimile_svgFile: svg file containing information about word positions. 
""" UNITTESTING = False def __init__(self, xml_source_file=None, faksimile_image=None, faksimile_svgFile=None, add_paths_near_words=False, warn=False, number=None): if xml_source_file is not None: super(Page,self).__init__(xml_source_file) self.update_property_dictionary('faksimile_image', faksimile_image) self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile) self.init_all_properties() self.add_style(style_node=self.page_tree.getroot().find('.//style')) self.faksimile_text_field = None self.svg_text_field = None self.init_node_objects() self.warn = warn self.add_deletion_paths_to_words(add_paths_near_words) else: self.page_tree = None self.number = number def add_deletion_paths_to_words(self, add_paths_near_words=False): """Add deletion paths to words. """ words = [ word for word in self.words if (len(word.word_parts) == 0 and word.deleted and len(word.deletion_paths) == 0)\ or 'add_paths_near_words' in word.process_flags ] words += [ word for word in self.words\ if len(word.word_parts) > 0 and True in\ [ (wp.deleted and len(wp.deletion_paths) == 0) for wp in word.word_parts ]] if len(words) > 0 and ((self.svg_file is not None and isfile(self.svg_file))\ or (self.source is not None and isfile(self.source))): svg_file = self.svg_file if self.svg_file is not None else self.source transkription_field = TranskriptionField(svg_file) tr_xmin = transkription_field.xmin if (self.svg_image is None or self.svg_image.text_field is None) else 0 tr_ymin = transkription_field.ymin if (self.svg_image is None or self.svg_image.text_field is None) else 0 word_deletion_paths = self.word_deletion_paths index = 0 dp_updated = False while index < len(words): word = words[index] word.add_deletion_paths(word_deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) if len(word.deletion_paths) > 0 or True in [ len(w.deletion_paths) > 0 for w in word.word_parts ]: deletion_paths = word.deletion_paths for wp in word.word_parts: deletion_paths += wp.deletion_paths for deletion_path in deletion_paths: if deletion_path not in self.word_deletion_paths: self.word_deletion_paths.append(deletion_path) elif not dp_updated: word_deletion_paths = extract_paths_on_tf(self) dp_updated = True index -= 1 if add_paths_near_words\ and ('add_paths_near_words' in word.process_flags\ or ((word.deleted and len(word.deletion_paths) == 0)\ or True in [ (w.deleted and len(w.deletion_paths) == 0) for w in word.word_parts ])): if not dp_updated\ and 'add_paths_near_words' in word.process_flags: word_deletion_paths = extract_paths_on_tf(self) dp_updated = True transform = None tp = None target_word = word paths_near_word = [] if word.deleted and len(word.transkription_positions) > 0: transform = word.transkription_positions[0].transform for tp in word.transkription_positions: word.deletion_paths_near_word += get_paths_near_position(tp, word_deletion_paths) elif len(word.word_parts) > 0: for wp in word.word_parts: if wp.deleted and len(wp.transkription_positions) > 0: target_word = wp for tp in wp.transkription_positions: wp.deletion_paths_near_word = get_paths_near_position(tp, word_deletion_paths) if self.warn and (word.deleted and len(word.deletion_paths) == 0): warnings.warn(\ f'WARNING: {self.title} {self.number}: {word.id} on {word.line_number}, {word.text} has no deletion paths! {target_word.deletion_paths_near_word}, {transform}') index += 1 @classmethod def create_cls(cls, xml_source_file=None, create_dummy_page=False, page_node=None): """Create a Page. 
""" if not create_dummy_page: return cls(xml_source_file) else: m = re.match(r'(.*)(page[0]*)(.*)(\.xml)', xml_source_file) if m is not None and len(m.groups()) > 3: number = m.group(3) else: number = basename(xml_source_file).replace('.xml','') return cls(number=number) @classmethod def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None): """Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT. [optional: instantiation depends on the fulfilment of a status_contains and/or on the selection of some words by a word_selection_function]. """ source_tree = ET.parse(xml_file) if source_tree.getroot().find('metadata/type').text == cls.FILE_TYPE_SVG_WORD_POSITION: page = cls(xml_file) if word_selection_function is None or len(word_selection_function(page.words)) > 0: return [ page ] else: return [] elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT: pages = [] xpath = '//page/@output' if status_contains != '' and status_not_contain != '': xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain) elif status_contains != '': xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains) elif status_not_contain != '': xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain) for xml_source_file in source_tree.xpath(xpath): if isfile(xml_source_file): pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function) return pages else: return [] @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = { 'number': { 'class': str, 'cardinality': 1}} properties.update(cls.create_semantic_property_dictionary('faksimile_image', FaksimileImage, subPropertyOf=cls.HAS_IMAGE)) properties.update(cls.create_semantic_property_dictionary('faksimile_text_field', TextField,\ name='pageIsOnFaksimileTextField', label='page is on faksimile text field',\ comment='Relates a page to the text field on a svg image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD)) properties.update(cls.create_semantic_property_dictionary('orientation', str)) properties.update(cls.create_semantic_property_dictionary('svg_image', SVGImage, subPropertyOf=cls.HAS_IMAGE)) properties.update(cls.create_semantic_property_dictionary('svg_text_field', TextField,\ name='pageIsOnSVGTextField', label='page is on svg text field',\ comment='Relates a page to the text field on a faksimile image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD)) - for key in [ 'imprints', 'lines', 'mark_foreign_hands', 'words', 'word_deletion_paths', 'word_insertion_marks']: + for key in [ 'lines','imprints', 'mark_foreign_hands', 'words', 'word_deletion_paths', 'word_insertion_marks']: properties.update(cls.create_semantic_property_dictionary(key, list)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) def get_word_deletion_path(self, path=None, d_attribute=None) ->WordDeletionPath: """Return a word deletion path that belongs to page. 
""" if path is None and d_attribute is None: raise Exception('ERROR: get_word_deletion_path needs a path or a d_attribute!') if d_attribute is None: d_attribute = path.d_attribute page_paths = [ dpath for dpath in self.word_deletion_paths if dpath.d_attribute == d_attribute ] if len(page_paths) > 0: return page_paths[0] else: dpath = WordDeletionPath.create_cls(self, path=path, d_attribute=d_attribute) if dpath is not None: dpath.id = len(self.word_deletion_paths) self.word_deletion_paths.append(dpath) dpath.attach_object_to_tree(self.page_tree) return dpath def init_node_objects(self): """Initialize all node objects. """ self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ] self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ] self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ] - self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ] + #self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ] + self.words += [ TextConnectionMark.instantiate_as_word(node, id=index+len(self.words))\ + for index, node in enumerate(self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG)) ] self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ] self.imprints = [ Imprint.create_cls_from_node(imprint_node, self.lines) for imprint_node in self.page_tree.getroot().xpath('//' + Imprint.XML_TAG) ] self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ] self.word_deletion_paths = [ WordDeletionPath.create_cls(self, node=node) for node in self.page_tree.xpath('./' + WordDeletionPath.XML_TAG) ] if self.faksimile_image is not None and self.faksimile_image.text_field is not None: self.faksimile_text_field = self.faksimile_image.text_field if self.svg_image is not None and self.svg_image.text_field is not None: self.svg_text_field = self.svg_image.text_field for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks: simple_word.init_word(self) for wim in self.word_insertion_marks: if wim.line_number > -1: wim.line = [ line for line in self.lines if line.id == wim.line_number ][0] def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]): """Update word ids and attach them to page.page_tree. 
""" if not self.is_locked(): update_function_on_word = [ update_function_on_word ]\ if type(update_function_on_word) != list\ else update_function_on_word for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG): node.getparent().remove(node) for index, word in enumerate(self.words): word.id = index for func in update_function_on_word: if callable(func): func(word) word.attach_word_to_tree(self.page_tree) for index, mark_foreign_hands in enumerate(self.mark_foreign_hands): mark_foreign_hands.id = index if MarkForeignHands in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(mark_foreign_hands) mark_foreign_hands.attach_word_to_tree(self.page_tree) for index, text_connection_mark in enumerate(self.text_connection_marks): text_connection_mark.id = index if TextConnectionMark in include_special_words_of_type: for func in update_function_on_word: if callable(update_function_on_word): func(text_connection_mark) text_connection_mark.attach_word_to_tree(self.page_tree) else: print('locked') def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None): """Update the data source of page. """ if faksimile_svgFile is not None: self.faksimile_svgFile = faksimile_svgFile data_node = self.page_tree.xpath('.//data-source')[0]\ if len(self.page_tree.xpath('.//data-source')) > 0\ else ET.SubElement(self.page_tree.getroot(), 'data-source') data_node.set('file', self.faksimile_svgFile) if xml_correction_file is not None: data_node.set('xml-corrected-words', xml_correction_file) def update_line_number_area(self, transkription_field, svg_tree=None, set_to_text_field_zero=True): """Determines the width of the area where the line numbers are written in the page.source file. 
""" THRESHOLD = 0.4 if svg_tree is None: svg_tree = ET.parse(self.source) if len(self.line_numbers) > 1: line_number = self.line_numbers[9]\ if transkription_field.is_page_verso() and len(self.line_numbers) > 8\ else self.line_numbers[1] ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\ if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\ and LineNumber.IS_A_LINE_NUMBER(item)\ and LineNumber(raw_text_node=item).id == line_number.id ] if len(ln_nodes) > 0: matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform')) if transkription_field.is_page_verso(): transkription_field.add_line_number_area_width(matrix.getX()) elif self.svg_file is not None and isfile(self.svg_file): svg_path_tree = ET.parse(self.svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() } svg_x = matrix.getX() svg_y = self.line_numbers[1].bottom + transkription_field.ymin\ if set_to_text_field_zero\ else self.line_numbers[1].bottom use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\ .format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces) if len(use_nodes) > 0: symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '') d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces) if len(d_strings) > 0 and d_strings[0] != '': path = parse_path(d_strings[0]) xmin, xmax, ymin, ymax = path.bbox() width = xmax - xmin transkription_field.add_line_number_area_width(matrix.getX() + width) def update_page_type(self, transkription_field=None): """Adds a source to page and attaches it to page_tree. """ if self.number.endswith('r')\ or self.number.endswith('v'): self.page_type = Page.PAGE_VERSO\ if self.number.endswith('v')\ else Page.PAGE_RECTO else: if transkription_field is None: if self.source is None or not isfile(self.source): raise FileNotFoundError('Page does not have a source!') transkription_field = TranskriptionField(self.source, multipage_index=self.multipage_index) self.page_type = Page.PAGE_VERSO\ if transkription_field.is_page_verso()\ else Page.PAGE_RECTO self.page_tree.getroot().set('pageType', self.page_type) def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False): """Update styles of words and add them to their transkription_positions. Args: add_to_parents: Add styles also to word (and if not None to manuscript). partition_according_to_styles: Partition word if its transkription_positions have different styles. 
""" style_dictionary = {} if words is None: words = self.words for word in words: if len(word.word_parts) > 0: self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\ add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles) for transkription_position in word.transkription_positions: if len(transkription_position.positional_word_parts) > 0: style_class = transkription_position.positional_word_parts[0].style_class writing_process_id = -1 for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]: writing_process_id = self.fontsizekey2stage_mapping.get(font_key) style_class_key = (Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css), writing_process_id) if create_css: if style_dictionary.get((style_class_key, word.deleted)) is None: color = None if len(word.deletion_paths) > 0: if word.deletion_paths[0].style_class is not None\ and word.deletion_paths[0].style_class != ''\ and self.style_dict.get(word.deletion_paths[0].style_class) is not None: color = Color.create_cls_from_style_object(self.style_dict.get(word.deletion_paths[0].style_class)) else: color = Color() style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\ create_css=create_css, deletion_color=color, writing_process_id=style_class_key[1] ) transkription_position.style = style_dictionary[(style_class_key, word.deleted)] #print(style_dictionary[(style_class_key, word.deleted)]) else: if style_dictionary.get(style_class_key) is None: style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css) style_dictionary[style_class_key].writing_process_id = style_class_key[1] transkription_position.style = style_dictionary[style_class_key] if add_to_parents and transkription_position.style not in word.styles: word.styles.append(transkription_position.style) if partition_according_to_styles: word.split_according_to_status('style', splits_are_parts=True) if manuscript is not None\ and add_to_parents: manuscript.update_styles(*style_dictionary.values()) def __eq__(self, other): """Returns true if self is qualitatively identical to other. """ if other is None: return False if self.page_tree is None and other.page_tree is None: return self.number == other.number if self.page_tree is None or other.page_tree is None: return False return self.page_tree.docinfo.URL == other.page_tree.docinfo.URL def __hash__(self): """Return a hash value for self. """ try: if self.page_tree is None: return hash(self.number) except AttributeError: print(self) return hash(self.number) return hash(self.page_tree.docinfo.URL) Index: svgscripts/datatypes/faksimile.py =================================================================== --- svgscripts/datatypes/faksimile.py (revision 110) +++ svgscripts/datatypes/faksimile.py (revision 111) @@ -1,204 +1,205 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a faksimile page. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" import re from lxml import etree as ET from os import path from os.path import isdir, isfile, sep, basename from svgpathtools.parser import parse_path from .faksimile_image import FaksimileImage from .matrix import Matrix from .text_field import TextField from .word_position import WordPosition class FaksimilePage: """ This class represents a faksimile page. Args: xml_target_file (str): name of the xml file to which page info will be written. xml_source_file (str): name of the xml file that will be instantiated. """ XML_TAG = 'faksimile-page' def __init__(self, xml_source_file=None, xml_target_file=None, title=None, page_number=None, svg_source_file=None, faksimile_image=None, text_field=None): xml_file = xml_source_file if xml_source_file is not None else xml_target_file self.title = title self.page_number = page_number self.xml_file = xml_file if xml_file is not None and isfile(xml_file): parser = ET.XMLParser(remove_blank_text=True) self.page_tree = ET.parse(xml_file, parser) self.title = self.page_tree.getroot().get('title') self.page_number = self.page_tree.getroot().get('page-number') self.width = float(self.page_tree.getroot().get('width')) if bool(self.page_tree.getroot().get('width')) else 0.0 self.height = float(self.page_tree.getroot().get('height')) if bool(self.page_tree.getroot().get('height')) else 0.0 else: self.page_tree = ET.ElementTree(ET.Element(self.XML_TAG)) if title is not None: self.page_tree.getroot().set('title', title) if page_number is not None: self.page_tree.getroot().set('page-number', str(page_number)) if xml_target_file is not None: self.remove_tags_from_page_tree([WordPosition.FAKSIMILE]) if svg_source_file is not None: self.page_tree.getroot().set('svg-source-file', svg_source_file) if faksimile_image is not None: faksimile_image.attach_object_to_tree(self.page_tree) if text_field is not None: text_field.attach_object_to_tree(self.page_tree) self.svg_source_file = self.page_tree.getroot().get('svg-source-file') self.faksimile_image = FaksimileImage(node=self.page_tree.getroot().find('.//' + FaksimileImage.XML_TAG))\ if len(self.page_tree.getroot().findall('.//' + FaksimileImage.XML_TAG)) > 0 else None self.text_field = TextField(node=self.page_tree.getroot().find('.//' + TextField.XML_TAG))\ if len(self.page_tree.getroot().findall('.//' + TextField.XML_TAG)) > 0 else None self.word_positions = [ WordPosition(node=node) for node in self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE) ]\ if len(self.page_tree.getroot().findall('.//' + WordPosition.FAKSIMILE)) > 0 else [] def append_word_position(self, word_position): """Appends word_position to word_positions and attaches it to page_tree. """ self.word_positions.append(word_position) word_position.attach_object_to_tree(self.page_tree) @classmethod def get_faksimile_pages(cls, svg_file, page_number='') -> list: """Creates and returns text fields contained in a svg_file as a list. 
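        Usage sketch (hypothetical faksimile svg file exported from the layout program):
            faksimile_pages = FaksimilePage.get_faksimile_pages('W-II-1.svg', page_number='49')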
""" svg_tree = ET.parse(svg_file) namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } return cls.GET_FAKSIMILEPAGES(svg_tree, namespaces=namespaces, page_number=page_number) @staticmethod def GET_FAKSIMILEPAGES(svg_tree, namespaces=None, page_number='') -> list: """Creates and returns text fields contained in a svg_tree as a list. """ THRESHOLD_X = 10 if namespaces is None: namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } source_file_name = svg_tree.docinfo.URL image = FaksimileImage.CREATE_IMAGE(svg_tree.getroot().find('.//image', svg_tree.getroot().nsmap), source_file_name) xml_dir = '.{}xml'.format(sep) faksimile_pages = list() title_string = re.sub(r'[,_][0-9]+.*\.svg', '', basename(source_file_name)) if re.match(r'.*-\d+[a-z]$', title_string): title_string = re.sub(r'-\d+[a-z]$', '', title_string) title = title_string.replace('-', ' ') rect_list = [ rect for rect in svg_tree.getroot().findall('.//rect', svg_tree.getroot().nsmap)\ if rect.get('id', svg_tree.getroot().nsmap).startswith(title_string)\ and rect.get('id', svg_tree.getroot().nsmap).endswith(str(page_number)) ] for text_field_rect in rect_list: tf_x = float(text_field_rect.get('x', svg_tree.getroot().nsmap)) - image.x tf_y = float(text_field_rect.get('y', svg_tree.getroot().nsmap)) - image.y tf_width = float(text_field_rect.get('width', svg_tree.getroot().nsmap)) tf_height = float(text_field_rect.get('height', svg_tree.getroot().nsmap)) tf_matrix = Matrix(transform_matrix_string=text_field_rect.get('transform'))\ if bool(text_field_rect.get('transform'))\ else None id = text_field_rect.get('id', svg_tree.getroot().nsmap) target_file_name = xml_dir + sep + id + '.xml' if isdir(xml_dir) else id + '.xml' page_number = re.sub(r'.*[,_]', '', id) if page_number.startswith('0'): page_number = page_number.lstrip('0') text_field = TextField(id=id, width=tf_width, height=tf_height, x=tf_x, y=tf_y, matrix=tf_matrix) faksimile_page = FaksimilePage(xml_target_file=target_file_name, svg_source_file=source_file_name,\ title=title, page_number=page_number, faksimile_image=image, text_field=text_field) x_min = text_field.xmin + image.x y_min = text_field.ymin + image.y #rect_titles = svg_tree.getroot().xpath('//ns:rect[@x>"{0}" and @x<"{1}" and @y>"{2}" and @y<"{3}" and @id!="{4}"]/ns:title'.format(\ # x_min, text_field.xmax + image.x - THRESHOLD_X, y_min, text_field.ymax + image.y, text_field.id), namespaces=namespaces) rect_titles = get_paths_inside_rect(svg_tree, '//ns:rect/ns:title', x_min, text_field.xmax + image.x - THRESHOLD_X,\ y_min, text_field.ymax + image.y, text_field.id, namespaces=namespaces) rect_titles += get_paths_inside_rect(svg_tree, '//ns:path/ns:title', x_min, text_field.xmax + image.x - THRESHOLD_X,\ y_min, text_field.ymax + image.y, text_field.id, namespaces=namespaces) for rect_title in rect_titles: rect = rect_title.getparent() x, y, height, width = 0.0, 0.0, 0.0, 0.0 if rect.tag.endswith('path'): path = parse_path(rect.get('d')) x, xmax, y, ymax = path.bbox() width = xmax - x height = ymax - y else: x = float(rect.get('x', svg_tree.getroot().nsmap)) y = float(rect.get('y', svg_tree.getroot().nsmap)) height = float(rect.get('height', svg_tree.getroot().nsmap)) width = width=float(rect.get('width', svg_tree.getroot().nsmap)) matrix = None if bool(rect.get('transform')): matrix = Matrix(transform_matrix_string=rect.get('transform')) + text = re.sub(r'(\s(?=[-;:.,?!’–])|(?<=[-;:.,?!’–])\s)', '', rect_title.text) 
faksimile_page.append_word_position(\ - WordPosition(id=rect.get('id', svg_tree.getroot().nsmap), text=rect_title.text, height=height,\ + WordPosition(id=rect.get('id', svg_tree.getroot().nsmap), text=text, height=height,\ width=width, x=x, y=y, matrix=matrix, tag=WordPosition.FAKSIMILE)) faksimile_pages.append(faksimile_page) return faksimile_pages def remove_tags_from_page_tree(self, list_of_tags_to_remove): """Removes the tags specified in the list from the target tree. """ for xpath2remove in list_of_tags_to_remove: for node in self.page_tree.xpath('//' + xpath2remove): node.getparent().remove(node) def get_paths_inside_rect(svg_tree, xpath, x_min, x_max, y_min, y_max, not_id, namespaces={}): """Returns a list of all paths selected by xpath that are inside x_min, x_max, y_min, y_max and do not have id == not_id. """ paths = [] if len(namespaces) == 0: namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } for path_node in svg_tree.xpath(xpath, namespaces=namespaces): append_node = path_node if not path_node.tag.endswith('path') and not path_node.tag.endswith('rect'): path_node = path_node.getparent() x, xmax, y, ymax = -1, -1, -1, -1 init_xy = False if path_node.tag.endswith('rect'): x = float(path_node.get('x')) if bool(path_node.get('x')) else -1 y = float(path_node.get('y')) if bool(path_node.get('y')) else -1 xmax = x + float(path_node.get('width')) if bool(path_node.get('width')) else -1 ymax = y + float(path_node.get('height')) if bool(path_node.get('height')) else -1 init_xy = True elif path_node.tag.endswith('path') and bool(path_node.get('d')) and path_node.get('d') != 0: path = parse_path(path_node.get('d')) x, xmax, y, ymax = path.bbox() init_xy = True if init_xy: if bool(path_node.get('transform')): matrix = Matrix(transform_matrix_string=path_node.get('transform')) x, xmax = matrix.get_new_x(x=x, y=y), matrix.get_new_x(x=xmax, y=ymax) y, ymax = matrix.get_new_y(x=x, y=y), matrix.get_new_y(x=xmax, y=ymax) width = xmax - x height = ymax - y if x > x_min and x < x_max\ and y > y_min and y < y_max\ and path_node.get('id') != not_id: paths.append(append_node) return paths Index: svgscripts/datatypes/editor_comment.py =================================================================== --- svgscripts/datatypes/editor_comment.py (revision 110) +++ svgscripts/datatypes/editor_comment.py (revision 111) @@ -1,81 +1,84 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a comment by the editors. """ # Copyright (C) University of Basel 2020 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET import re import sys from .attachable_object import AttachableObject +from .text import Text sys.path.append('py2ttl') from class_spec import SemanticClass from xml_conform_dictionary import XMLConformDictionary class EditorComment(AttachableObject,SemanticClass): """ This class represents a comment by the editors. """ XML_TAG = 'editor-comment' - def __init__(self, id=0, comment=None, is_uncertain=False): + def __init__(self, id=0, comment=None, is_uncertain=False, text=None): self.id = id self.comment = comment self.is_uncertain = is_uncertain + self.text = text def attach_object_to_tree(self, target_tree): """Attach object to tree. """ obj_node = self.get_or_create_node_with_id(target_tree) obj_node.set('type', self.__class__.__name__) xml_conform_dictionary = XMLConformDictionary.create_cls_from_data_object(self) xml_conform_dictionary.attach_data_to_tree(obj_node) @classmethod def create_cls_from_node(cls, node): """Initialize a cls from node. [:return:] cls """ target_cls = cls cls_type = node.get('type') target_classes = [ target for target in cls.__subclasses__() if target.__name__ == cls_type ] if len(target_classes) > 0: target_cls = target_classes[0] return XMLConformDictionary.CREATE_INSTANCEOF_CLASS_FROM_NODE(target_cls, node) @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ properties = {} properties.update(cls.create_semantic_property_dictionary('is_uncertain', bool,\ name='isUncertain', label='whether something is uncertain')) properties.update(cls.create_semantic_property_dictionary('comment', str)) + properties.update(cls.create_semantic_property_dictionary('text', Text, name='commentHasText')) dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties } return cls.return_dictionary_after_updating_super_classes(dictionary) Index: svgscripts/datatypes/text_connection_mark.py =================================================================== --- svgscripts/datatypes/text_connection_mark.py (revision 110) +++ svgscripts/datatypes/text_connection_mark.py (revision 111) @@ -1,101 +1,114 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a text connection mark ("Anschlusszeichen"). """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET import sys +from .editor_comment import EditorComment from .footnotes import extract_footnotes from .reference import Reference from .special_word import SpecialWord from .transkriptionField import TranskriptionField +from .word import Word class TextConnectionMark(SpecialWord): """ This class represents a text connection mark. """ XML_TAG = 'text-connection-mark' XML_SUB_TAG = Reference.XML_TAG SPECIAL_CHAR_LIST = [ '*', 'W' ] FOOTNOTE_CONTAINS = [ 'Anschlußzeichen', 'Hinzufügungszeichen' ] def __init__(self, id=0, line_number=-1, text='*', transkription_positions=[], faksimile_positions=[], text_source=None): super(TextConnectionMark, self).__init__(id=id, text=text, line_number=line_number, transkription_positions=transkription_positions,\ faksimile_positions=faksimile_positions) self.text_source = text_source def add_content(self, node): """Adds content to TextConnectionMark. """ self.text_source = Reference.create_cls(node=node) def attach_word_to_tree(self, target_tree): """Attaches TextConnectionMark to tree target_tree. """ node = super(TextConnectionMark,self).attach_word_to_tree(target_tree) if self.text_source is not None: self.text_source.attach_object_to_tree(node) @classmethod + def instantiate_as_word(cls, node, id=0) ->Word: + """Instantiate a TextConnectionMark as a Word. + """ + tcm = cls.create_cls(node) + word = Word(id=id, text=tcm.text, line_number=tcm.line_number, transkription_positions=tcm.transkription_positions, faksimile_positions=tcm.faksimile_positions) + if tcm.text_source is not None: + comment = 'Hinzufügungszeichen zu ' + tcm.text_source.toString() + word.editor_comments.append(EditorComment(comment=comment, is_uncertain=tcm.text_source.is_uncertain)) + return word + + @classmethod def find_content_in_footnotes(cls, page, transkription_field=None, svg_tree=None, title='', page_number='', footnotes=None, skip_after=-1.0): """Find content for the TextConnectionMark. """ if footnotes is None: if svg_tree is None: svg_tree = ET.parse(page.source) if transkription_field is None: transkription_field = TranskriptionField(page.source) footnotes = extract_footnotes(page, transkription_field=transkription_field, svg_tree=svg_tree, contains_strings=cls.FOOTNOTE_CONTAINS, skip_after=skip_after) else: footnotes = [ footnote for footnote in footnotes if True in [ contains_string in footnote.content for contains_string in cls.FOOTNOTE_CONTAINS ] ] for text_connection_mark in page.text_connection_marks: relevant_footnotes = [ footnote.content for footnote in footnotes if footnote.content.strip().startswith(str(text_connection_mark.line_number)+ ':') ] if len(relevant_footnotes) > 0: footnote_string = relevant_footnotes[0].strip() line_number = int(footnote_string.split(':')[0]) is_uncertain = footnote_string.endswith('?') reference_string = footnote_string.replace('?', '').split('zu ')[1].strip() text_connection_mark.text_source = Reference.create_cls(is_uncertain=is_uncertain,\ reference_string=reference_string, title=page.title, page_number=page.number) @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. 
""" dictionary = super(TextConnectionMark,cls).get_semantic_dictionary() dictionary['properties'].update(cls.create_semantic_property_dictionary('text_source', Reference,\ cardinality=1, name='textConnectionMarkHasTextSource', label='text connection mark has a text source')) return cls.return_dictionary_after_updating_super_classes(dictionary) @classmethod def get_special_char_list(cls): """Returns a list of the chars that define this special word. """ return cls.SPECIAL_CHAR_LIST Index: svgscripts/datatypes/transkription_position.py =================================================================== --- svgscripts/datatypes/transkription_position.py (revision 110) +++ svgscripts/datatypes/transkription_position.py (revision 111) @@ -1,197 +1,203 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a transkription word position. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET from os.path import isfile import sys from .debug_message import DebugMessage from .image import SVGImage from .positional_word_part import PositionalWordPart from .word_position import WordPosition from .matrix import Matrix sys.path.append('py2ttl') from class_spec import SemanticClass class TranskriptionPosition(WordPosition): """ This class represents the position of a word on the transkription as it is displayed by a svg image. @label position of a word on the topological transkription Args: id (int): word id matrix (datatypes.Matrix): matrix containing information about transformation. 
height (float): height of word width (float): width of word x (float): x position of word y (float): y position of word positional_word_parts a list of (datatypes.positional_word_part) PositionalWordPart debug_message a (datatypes.debug_message) DebugMessage """ ADD2X = 0.15 ADD2TOP = 1.0 ADD2BOTTOM = 0.2 HEIGHT_FACTOR = 1.1 # factor that multiplies biggest_font_size -> height XML_TAG = WordPosition.TRANSKRIPTION def __init__(self, id=0, node=None, height=0.0, width=0.0, x=0.0, y=0.0, matrix=None, positional_word_parts=None, debug_message=None): super(TranskriptionPosition, self).__init__(id=id, node=node, height=height, width=width, x=x, y=y, matrix=matrix, tag=WordPosition.TRANSKRIPTION) self.positional_word_parts = positional_word_parts if positional_word_parts is not None else [] self.debug_message = debug_message self.deleted = False self._deletion_paths = [] self.has_box = None self.style = None self.svg_image = None if node is not None: self.debug_message = DebugMessage(node=node.xpath('.//' + DebugMessage.XML_TAG)[0])\ if len(node.xpath('.//' + DebugMessage.XML_TAG)) > 0 else None self.positional_word_parts = [ PositionalWordPart(node=pwp_node) for pwp_node in node.xpath('.//' + PositionalWordPart.XML_TAG) ] self.attachable_objects += self.positional_word_parts if self.debug_message is not None: self.attachable_objects.append(self.debug_message) def get_text(self): """Returns the concatenated text of all positional_word_parts. """ return ''.join([pwp.text for pwp in self.positional_word_parts]) def is_mergebale_with(self, other) -> bool: """Return whether self and other have same writing_process_id or style. """ if self.writing_process_id == other.writing_process_id: return True if (self.writing_process_id == -1 or other.writing_process_id == -1)\ and (len(self.positional_word_parts) > 0 and len(other.positional_word_parts) > 0): return self.positional_word_parts[0].style_class == other.positional_word_parts[0].style_class return False def split(self, split_position, second_split=-1) ->list: """Split a transkription_position in two at split_position. :return: a list of the new transkription_positions """ transkription_positions = [] left_pwp = [ pwp for pwp in self.positional_word_parts if pwp.left + pwp.width < split_position ] transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(left_pwp, transkription_position_id=self.id) if second_split == -1: right_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp ] next_id = int(self.id) + 1 transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(right_pwp, transkription_position_id=str(next_id)) else: middle_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp and pwp.left + pwp.width < second_split ] next_id = int(self.id) + 1 transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(middle_pwp, transkription_position_id=str(next_id)) right_pwp = [ pwp for pwp in self.positional_word_parts if pwp not in left_pwp and pwp not in middle_pwp ] next_id = int(self.id) + 1 transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(right_pwp, transkription_position_id=str(next_id)) return transkription_positions def update_positional_word_parts(self, positional_word_parts): """Update positional_word_parts. 
""" if len(self.positional_word_parts) > 0 and self.positional_word_parts in self.attachable_objects: self.attachable_objects.remove(self.positional_word_parts) self.positional_word_parts = positional_word_parts self.attachable_objects += self.positional_word_parts @staticmethod def CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(positional_word_parts, debug_message=None, debug_msg_string=None, transkription_position_id=0): """Creates a list of TranskriptionPosition from a list of (datatypes.positional_word_part) PositionalWordPart. [:return:] a list of (datatypes.transkription_position) TranskriptionPosition """ TOPCORRECTION = 1 debug_message = DebugMessage(message=debug_msg_string)\ if debug_msg_string is not None else debug_message transkription_positions = [] if len(positional_word_parts) < 1: return [] matrix = positional_word_parts[0].transform index = 0 matrices_differ = False style_class = positional_word_parts[0].style_class styles_differ = False while index < len(positional_word_parts) and not matrices_differ and not styles_differ: if Matrix.DO_CONVERSION_FACTORS_DIFFER(matrix, positional_word_parts[index].transform): matrices_differ = True elif style_class != positional_word_parts[index].style_class: styles_differ = True else: index += 1 if (matrices_differ or styles_differ) and index < len(positional_word_parts): debug_msg_string = 'matrices differ' if matrices_differ else 'styles differ' transkription_positions += TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\ positional_word_parts[index:], debug_msg_string=debug_msg_string, transkription_position_id=int(transkription_position_id)+1) positional_word_parts = positional_word_parts[:index] heighest_pwp = sorted(positional_word_parts, key=lambda pwp: pwp.height, reverse=True)[0] toppest_pwp = sorted(positional_word_parts, key=lambda pwp: pwp.top)[0] height = heighest_pwp.height + 2*TOPCORRECTION if heighest_pwp != toppest_pwp: height += abs(heighest_pwp.top-toppest_pwp.top) x = positional_word_parts[0].left - TranskriptionPosition.ADD2X - y = toppest_pwp.top - TOPCORRECTION width = positional_word_parts[-1].left - x\ + positional_word_parts[-1].width + TranskriptionPosition.ADD2X + if matrix is not None and matrix.isRotationMatrix(): + x = positional_word_parts[0].left - matrix.matrix[Matrix.XINDEX]\ + if positional_word_parts[0].left - matrix.matrix[Matrix.XINDEX] > 0\ + else 0.0 + y = toppest_pwp.top - TOPCORRECTION\ + if matrix is None or not matrix.isRotationMatrix()\ + else height*-1 for pwp_index, pwp in enumerate(positional_word_parts): pwp.id = pwp_index transkription_positions.insert(0, TranskriptionPosition(id=transkription_position_id, height=height, width=width, x=x, y=y, matrix=matrix,\ positional_word_parts=positional_word_parts, debug_message=debug_message)) return transkription_positions @staticmethod def CREATE_TRANSKRIPTION_POSITION_LIST(page, word_part_objs, matrix=None, debug_msg_string=None, transkription_field=None, svg_path_tree=None, namespaces=None): """Creates a list of TranskriptionPosition from word_part_objs (i.e. a list of dictionaries with the keys: text, x, y, matrix, class). 
[:return:] a list of (datatypes.transkription_position) TranskriptionPosition """ positional_word_parts = [] debug_message = DebugMessage(message=debug_msg_string)\ if debug_msg_string is not None else None if page.svg_file is not None and isfile(page.svg_file): svg_path_tree = ET.parse(page.svg_file) if svg_path_tree is None else svg_path_tree namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }\ if namespaces is None else namespaces xmin = 0.0 ymin = 0.0 if transkription_field is not None: xmin = transkription_field.xmin ymin = transkription_field.ymin for part_obj in word_part_objs: positional_word_parts += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(\ part_obj, svg_path_tree, namespaces, page, start_id=len(positional_word_parts),\ xmin=xmin, ymin=ymin) else: positional_word_parts = PositionalWordPart.CREATE_SIMPLE_POSITIONAL_WORD_PART_LIST(page, word_part_objs) if len(positional_word_parts) > 0: return TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(positional_word_parts, debug_message=debug_message) else: return [ TranskriptionPosition(matrix=matrix, debug_message=debug_message) ] Index: svgscripts/datatypes/reference.py =================================================================== --- svgscripts/datatypes/reference.py (revision 110) +++ svgscripts/datatypes/reference.py (revision 111) @@ -1,173 +1,187 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This class can be used to represent a text reference. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" from lxml import etree as ET import re import sys from .attachable_object import AttachableObject sys.path.append('py2ttl') from class_spec import SemanticClass NON_INT = re.compile('\D+') class Reference(AttachableObject,SemanticClass): """ This class represents a text reference. 
Args: id (int): object id first_line (int) first line of reference last_line (int) last line of reference is_uncertain (bool) whether reference is uncertain title (str) title of reference page_number (str) page_number of reference tag (str) xml tag """ XML_TAG = 'reference' intKeys = [ 'first_line', 'last_line'] boolKeys = [ 'is_uncertain' ] stringKeys = [ 'title', 'page_number', 'word_reference' ] def __init__(self, node=None, id=0, first_line=-1, last_line=-1, is_uncertain=False, title=None, page_number=None, word_reference=None, tag=XML_TAG): self.intKeys = [] self.intKeys += Reference.intKeys self.intKeys.append('id') self.stringKeys = [] self.stringKeys += Reference.stringKeys self.boolKeys = [] self.boolKeys += Reference.boolKeys self.id = id - self.first_line = first_line - self.last_line = last_line + self.first_line = int(first_line) + self.last_line = int(last_line) self.is_uncertain = is_uncertain self.title = title self.page_number = page_number self.word_reference = word_reference self.tag = tag def attach_object_to_tree(self, target_tree): """Attach object to tree. """ if target_tree.__class__.__name__ == '_ElementTree': target_tree = target_tree.getroot() obj_node = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)[0] \ if(len(target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)) > 0) \ else ET.SubElement(target_tree, self.tag) for key in self.boolKeys: if self.__dict__[key] is not None: obj_node.set(key.replace('_','-'), str(self.__dict__[key]).lower()) for key in self.intKeys: if self.__dict__[key] is not None and self.__dict__[key] > -1: obj_node.set(key.replace('_','-'), str(self.__dict__[key])) for key in self.stringKeys: if self.__dict__[key] is not None and self.__dict__[key] != '': obj_node.set(key.replace('_','-'), str(self.__dict__[key])) @classmethod def create_cls_from_node(cls, node): """Creates a Reference from a (lxml.etree.Element) node. :return: (datatypes.reference) Reference """ instance = cls() for key in instance.boolKeys: xml_key = key.replace('_', '-') if bool(node.get(xml_key)): instance.__dict__[key] = node.get(xml_key) == 'true' for key in instance.intKeys: xml_key = key.replace('_', '-') if bool(node.get(xml_key)): instance.__dict__[key] = int(node.get(xml_key)) for key in instance.stringKeys: xml_key = key.replace('_', '-') if bool(node.get(xml_key)): instance.__dict__[key] = node.get(xml_key) return instance @classmethod def create_cls(cls, node=None, id=0, is_uncertain=False, reference_string='', title='', page_number=''): """Creates a Reference from a (lxml.etree.Element) node or a reference_string. 
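Example (the values correspond to the assertions in tests_svgscripts/test_reference.py below):

    reference = Reference.create_cls(reference_string='ASDF 5,5-8')
    # reference.title == 'ASDF', reference.page_number == '5',
    # reference.first_line == 5, reference.last_line == 8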
:return: (datatypes.reference) Reference """ if node is not None: return cls.create_cls_from_node(node) else: first_line = -1 last_line = -1 word_reference = None if re.match(r'[0-9]+([a-z]+)*,[0-9]+(-[0-9]+)*', reference_string): page_number = reference_string.split(',')[0] line_numbers = reference_string.split(',')[1].split('-') first_line = _save_get_int(line_numbers[0]) last_line = _save_get_int(line_numbers[1]) if len(line_numbers) > 1 else -1 else: if ',' not in reference_string: if re.match(r'\D+.*', reference_string): word_reference = reference_string.strip() else: line_numbers = reference_string.split('-') first_line = _save_get_int(line_numbers[0]) last_line = _save_get_int(line_numbers[1]) if len(line_numbers) > 1 else -1 else: if ' ' not in reference_string: raise Exception('String "{}" is not a valid reference_string'.format(reference_string)) title = reference_string.split(' ')[0] return cls.create_cls(id=id, is_uncertain=is_uncertain, reference_string=reference_string[len(title)+1:],\ title=title, page_number=page_number) return cls(id=id, is_uncertain=is_uncertain, first_line=first_line, last_line=last_line,\ title=title, page_number=page_number, word_reference=word_reference) @classmethod def get_semantic_dictionary(cls): """ Creates a semantic dictionary as specified by SemanticClass. """ dictionary = {} class_dict = cls.get_class_dictionary() properties = {} properties.update({'first_line': { 'class': int, 'cardinality': 1, 'cardinality_restriction': 'maxCardinality', 'name': 'firstLineOfReference',\ 'label': 'first line of reference'}}) properties.update({'last_line': { 'class': int, 'cardinality': 1, 'cardinality_restriction': 'maxCardinality', 'name': 'lastLineOfReference',\ 'label': 'last line of reference'}}) properties.update({'word_reference': { 'class': str, 'cardinality': 1, 'cardinality_restriction': 'maxCardinality', 'name': 'wordReference',\ 'label': 'refers to word on same line'}}) properties.update({'is_uncertain': { 'class': bool, 'cardinality': 0, 'name': 'IsUncertain', 'label': 'whether something is uncertain'}}) properties.update(cls.create_semantic_property_dictionary('title', str, cardinality=0)) properties.update(cls.create_semantic_property_dictionary('page_number', str, cardinality=0)) dictionary.update({cls.CLASS_KEY: class_dict}) dictionary.update({cls.PROPERTIES_KEY: properties}) return cls.return_dictionary_after_updating_super_classes(dictionary) + + def toString(self) ->str: + """Return a string representation of reference + """ + reference = self.title + ' '\ + if self.title is not None\ + else '' + reference = reference + self.page_number + ','\ + if self.page_number is not None\ + else '' + reference = reference + str(self.first_line) + '-' + str(self.last_line)\ + if self.last_line > -1\ + else reference + str(self.first_line) + return reference def _save_get_int(line_reference) -> int: """Return line as int and remove none int str at end of str. """ return int(NON_INT.sub('', line_reference)) Index: svgscripts/process_words_post_merging.py =================================================================== --- svgscripts/process_words_post_merging.py (revision 110) +++ svgscripts/process_words_post_merging.py (revision 111) @@ -1,495 +1,495 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to process words after they have been merged with faksimile data. 
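A typical invocation (a sketch; the manuscript file name is hypothetical):

    python3 svgscripts/process_words_post_merging.py xml/manuscript.xml

See main() below for the available options.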
""" # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see 1}}} from colorama import Fore, Style from deprecated import deprecated from functools import cmp_to_key import getopt import inspect import lxml.etree as ET from operator import attrgetter import os from os import listdir, sep, path, setpgrp, devnull from os.path import exists, isfile, isdir, dirname, basename from pathlib import Path as PathlibPath from progress.bar import Bar import re import shutil import string from svgpathtools import svg2paths2, svg_to_paths from svgpathtools.path import Path as SVGPath from svgpathtools.path import Line import sys import tempfile import warnings if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.box import Box from datatypes.archival_manuscript import ArchivalManuscriptUnity from datatypes.mark_foreign_hands import MarkForeignHands from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK from datatypes.path import Path from datatypes.text_connection_mark import TextConnectionMark from datatypes.transkriptionField import TranskriptionField from datatypes.word import Word, do_paths_intersect_saveMode, update_transkription_position_ids from extract_line_continuation import extract_line_continuations from util import back_up, process_warnings4status from process_files import update_svgposfile_status from process_footnotes import categorize_footnotes sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT from main_util import extract_paths_on_tf __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False DEBUG_WORD = None MERGED_DIR = 'merged' WARNING_FOOTNOTES_ERROR = 'footnotes not processed' WARNING_LINE_CONTINUATION = 'line continuation fail' def categorize_paths(page, transkription_field=None): """Categorize all paths that are part of the transkription field. :return: a dictionary containig a list for each category of path. 
""" if page.source is not None and isfile(page.source): MAX_HEIGHT_LINES = 1 max_line = sorted(\ [line_number.bottom-line_number.top for line_number in page.line_numbers if line_number.id % 2 == 0],\ reverse=True)[0] + 2 if len(page.line_numbers) > 0 else 17 tr_xmin = 0.0 tr_ymin = 0.0 if (page.svg_image is None or page.svg_image.text_field is None)\ and transkription_field is not None: tr_xmin = transkription_field.xmin tr_ymin = transkription_field.ymin paths, attributes = svg_to_paths.svg2paths(page.source) allpaths_outside_tf = [] attributes_outside_tf = [] if transkription_field is None: transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index) allpaths_on_tf = extract_paths_on_tf(page, outsiders=allpaths_outside_tf, outsider_attributes=attributes_outside_tf, transkription_field=transkription_field) path_dict = { 'text_area_deletion_paths': [],\ 'deletion_or_underline_paths': [],\ 'box_paths': [],\ 'dots_paths': [],\ 'word_connector_paths': [],\ 'uncategorized_paths': [] } for mypath in allpaths_on_tf: xmin, xmax, ymin, ymax = mypath.path.bbox() start_line_number = page.get_line_number(mypath.path.start.imag-tr_ymin) if abs(xmax-xmin) < 1 and abs(ymax-ymin) < 1: path_dict.get('dots_paths').append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) < max_line and mypath.path.iscontinuous() and mypath.path.isclosed(): path_dict.get('box_paths').append(mypath) elif abs(ymax-ymin) > MAX_HEIGHT_LINES and abs(ymax-ymin) > max_line and mypath.path.iscontinuous() and not mypath.path.isclosed(): path_dict.get('word_connector_paths').append(mypath) elif abs(ymax-ymin) < MAX_HEIGHT_LINES: mypath.start_line_number = start_line_number path_dict.get('deletion_or_underline_paths').append(mypath) elif start_line_number != -1 and start_line_number != page.get_line_number(mypath.path.end.imag-tr_ymin): # Check for "ladder", i.e. a path with 3 segments (seg0 is horizontal on line x, seg1 moves to line x+1, seg2 is horizontal on line x+1) if start_line_number + 1 == page.get_line_number(mypath.path.end.imag-tr_ymin)\ and len(mypath.path._segments) == 3\ and abs(mypath.path._segments[0].bbox()[3]-mypath.path._segments[0].bbox()[2]) < MAX_HEIGHT_LINES\ and abs(mypath.path._segments[2].bbox()[3]-mypath.path._segments[2].bbox()[2]) < MAX_HEIGHT_LINES: for index in 0, 2: new_path = Path(parent_path=mypath, path=SVGPath(mypath.path._segments[index])) new_path.start_line_number = page.get_line_number(new_path.path.start.imag-tr_ymin) path_dict.get('deletion_or_underline_paths').append(new_path) else: path_dict.get('text_area_deletion_paths').append(mypath) else: path_dict.get('uncategorized_paths').append(mypath) underline_path = mark_words_intersecting_with_paths_as_deleted(page, path_dict.get('deletion_or_underline_paths'), tr_xmin, tr_ymin) path_dict.update({'underline_path': underline_path}) path_dict['uncategorized_paths'] += process_word_boxes(page, path_dict.get('box_paths'), transkription_field,\ paths=allpaths_outside_tf, attributes=attributes_outside_tf, max_line=max_line) return path_dict elif not UNITTESTING: error_msg = 'Svg source file {} does not exist!'.format(page.source)\ if page.source is not None else 'Page does not contain a source file!' raise FileNotFoundError(error_msg) return {} def copy_page_to_merged_directory(page, manuscript_file=None): """Copy page to directory that contains the first version of all svg_pos_files that have been merged with the faksimile position data. MERGED_DIR is a subfolder of svg_pos_files-directory. 
""" svg_pos_file = PathlibPath(page.page_tree.docinfo.URL) target_dir = svg_pos_file.parent / MERGED_DIR if not target_dir.is_dir(): target_dir.mkdir() target_pos_file = target_dir / svg_pos_file.name save_page(page, str(svg_pos_file), target_svg_pos_file=str(target_pos_file), status=STATUS_MERGED_OK, manuscript_file=manuscript_file) def find_special_words(page, transkription_field=None): """Find special words, remove them from words, process their content. """ if page.source is None or not isfile(page.source): raise FileNotFoundError('Page does not have a source!') if transkription_field is None: transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index) set_to_text_field_zero = (page.svg_image is None or page.svg_image.text_field is None) special_char_list = MarkForeignHands.get_special_char_list() special_char_list += TextConnectionMark.get_special_char_list() single_char_words = [ word for word in page.words if len(word.text) == 1 and word.text in special_char_list ] if not UNITTESTING: bar = Bar('find special words', max=len(single_char_words)) for word in single_char_words: not bool(UNITTESTING) and bar.next() if word.text == MarkForeignHands.CLASS_MARK: id = len(page.mark_foreign_hands) page.mark_foreign_hands.append(MarkForeignHands.create_cls_from_word(word, id=id)) page.words.remove(word) elif word.text in TextConnectionMark.SPECIAL_CHAR_LIST[0]\ or (word.text in TextConnectionMark.SPECIAL_CHAR_LIST\ and any(style in page.sonderzeichen_list for style\ in word.transkription_positions[0].positional_word_parts[0].style_class.split(' '))): id = len(page.text_connection_marks) page.text_connection_marks.append(TextConnectionMark.create_cls_from_word(word, id=id)) page.words.remove(word) not bool(UNITTESTING) and bar.finish() svg_tree = ET.parse(page.source) page.update_page_type(transkription_field=transkription_field) page.update_line_number_area(transkription_field, svg_tree=svg_tree, set_to_text_field_zero=set_to_text_field_zero) if page.marginals_source is not None: svg_tree = ET.parse(page.marginals_source) italic_classes = [ key for key in page.style_dict\ if bool(page.style_dict[key].get('font-family')) and page.style_dict[key]['font-family'].endswith('Italic') ] if len(page.mark_foreign_hands) > 0: MarkForeignHands.find_content(page.mark_foreign_hands, transkription_field, svg_tree, italic_classes=italic_classes,\ SonderzeichenList=page.sonderzeichen_list, set_to_text_field_zero=set_to_text_field_zero) if len(page.text_connection_marks) > 0: TextConnectionMark.find_content_in_footnotes(page, transkription_field, svg_tree) def mark_words_intersecting_with_paths_as_deleted(page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Marks all words that intersect with deletion paths as deleted and adds these paths to word_deletion_paths. 
[:return:] list of .path.Path that might be word_underline_paths """ if not UNITTESTING: bar = Bar('mark words that intersect with deletion paths', max=len(page.words)) for word in page.words: not bool(UNITTESTING) and bar.next() word = mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) for part_word in word.word_parts: part_word = mark_word_if_it_intersects_with_paths_as_deleted(part_word, page, deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin) word.partition_according_to_deletion() not bool(UNITTESTING) and bar.finish() # return those paths in deletion_paths that are not in page.word_deletion_paths return [ word_underline_path for word_underline_path in set(deletion_paths) - set(page.word_deletion_paths) ] def mark_word_if_it_intersects_with_paths_as_deleted(word, page, deletion_paths, tr_xmin=0.0, tr_ymin=0.0): """Marks word if it intersects with deletion paths as deleted and adds these paths to word_deletion_paths. [:return:] word """ word.deleted = False for transkription_position in word.transkription_positions: word_path = Path.create_path_from_transkription_position(transkription_position,\ tr_xmin=tr_xmin, tr_ymin=tr_ymin) intersecting_paths = [ deletion_path for deletion_path in deletion_paths\ if do_paths_intersect_saveMode(deletion_path, word_path) ] if DEBUG_WORD is not None and word.text == DEBUG_WORD.text and word.line_number == DEBUG_WORD.line_number: relevant_paths = [ path for path in deletion_paths if path.start_line_number == DEBUG_WORD.line_number ] #print(word.line_number, word_path.path.bbox(), [ path.path.bbox() for path in relevant_paths]) if len(intersecting_paths) > 0: #print(f'{word.line_number}: {word.id}, {word.text}: {intersecting_paths}') transkription_position.deleted = True transkription_position._deletion_paths += intersecting_paths for deletion_path in intersecting_paths: if deletion_path.parent_path is not None: deletion_path = deletion_path.parent_path if deletion_path not in page.word_deletion_paths: deletion_path.tag = Path.WORD_DELETION_PATH_TAG deletion_path.attach_object_to_tree(page.page_tree) page.word_deletion_paths.append(deletion_path) return word def post_merging_processing_and_saving(svg_pos_file=None, new_words=None, page=None, manuscript_file=None, target_svg_pos_file=None): """Process words after merging with faksimile word positions. 
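Example (a sketch; the file names are hypothetical):

    post_merging_processing_and_saving(svg_pos_file='xml/N_VII_1_page008.xml',
                                       manuscript_file='xml/N_VII_1.xml')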
""" if page is None and svg_pos_file is None: raise Exception('ERROR: post_merging_processing_and_saving needs either a Page or a svg_pos_file!') if page is None: page = Page(svg_pos_file) if page.source is None or not isfile(page.source): raise FileNotFoundError('Page instantiated from {} does not contain an existing source!'.format(svg_pos_file)) if svg_pos_file is None: svg_pos_file = page.page_tree.docinfo.URL if new_words is not None: page.words = sorted(new_words, key=attrgetter('id')) for word_node in page.page_tree.xpath('.//word'): word_node.getparent().remove(word_node) manuscript = ArchivalManuscriptUnity.create_cls(manuscript_file)\ if manuscript_file is not None\ else None copy_page_to_merged_directory(page, manuscript_file=manuscript_file) transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index) update_faksimile_line_positions(page) status = STATUS_MERGED_OK page.update_styles(manuscript=manuscript, partition_according_to_styles=True) save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file) categorize_paths(page, transkription_field=transkription_field) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('default') try: find_special_words(page, transkription_field=transkription_field) save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file) categorize_footnotes(page) save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file) extract_line_continuations(page, warning_message=WARNING_LINE_CONTINUATION) except Exception: warnings.warn(WARNING_FOOTNOTES_ERROR) status = process_warnings4status(w, [ WARNING_FOOTNOTES_ERROR, WARNING_LINE_CONTINUATION ], status, STATUS_POSTMERGED_OK) save_page(page, svg_pos_file, target_svg_pos_file=target_svg_pos_file, status=status, manuscript_file=manuscript_file) def process_word_boxes(page, box_paths, transkription_field, paths=None, attributes=None, max_line=17) -> list: """Process word boxes: partition words according to word boxes. 
[:return:] a list of paths that are not boxes """ MAX_HEIGHT_LINES = 1 not_boxes = [] try: if not UNITTESTING: bar = Bar('process word boxes', max=len(page.words)) svg_tree = ET.parse(page.source) namespaces = { k if k is not None else 'ns': v for k, v in svg_tree.getroot().nsmap.items() } allpaths_on_margin_field = [] tr_xmin = 0 if page.svg_image is not None and page.svg_image.text_field is not None\ else transkription_field.xmin tr_ymin = 0 if page.svg_image is not None and page.svg_image.text_field is not None\ else transkription_field.ymin if paths is None or attributes is None: paths = [] raw_paths, attributes = svg_to_paths.svg2paths(page.source) for index, raw_path in enumerate(raw_paths): paths.append(Path.create_cls(id=index, path=raw_path, style_class=attributes[index].get('class'), page=page)) for index, mypath in enumerate(paths): path = mypath.path xmin, xmax, ymin, ymax = path.bbox() attribute = attributes[index] if len(path) > 0\ and path != transkription_field.path\ and ((path.bbox()[1] < transkription_field.xmin and transkription_field.is_page_verso())\ or (path.bbox()[0] > transkription_field.xmax and not transkription_field.is_page_verso()))\ and abs(ymax-ymin) < max_line: allpaths_on_margin_field.append(mypath)#Path.create_cls(id=index, path=path, style_class=attribute.get('class'), page=page)) box_line_number_dict = {} for box_path in sorted(box_paths, key=lambda path: path.get_median_y()): line_number = page.get_line_number(box_path.get_median_y(tr_ymin=tr_ymin)) if line_number > 0: if line_number not in box_line_number_dict.keys(): box_line_number_dict.update({ line_number: [ box_path ]}) else: box_line_number_dict.get(line_number).append(box_path) boxes = [] for line_number in box_line_number_dict.keys(): box_paths_on_line = sorted(box_line_number_dict[line_number], key=lambda path: path.get_x()) margin_boxes_on_line = sorted([ margin_box for margin_box in allpaths_on_margin_field\ if page.get_line_number(margin_box.get_median_y(tr_ymin=tr_ymin)) == line_number ],\ key=lambda path: path.get_x()) threshold = 3 if line_number % 2 == 0 else 1.5 if len(margin_boxes_on_line) > 0: for box_path in box_paths_on_line: #print(line_number, box_path.path.d(), len(margin_boxes_on_line)) box = Box.create_box(box_path, margin_boxes_on_line, svg_tree=svg_tree,\ namespaces=namespaces, threshold=threshold) if box is not None: boxes.append(box) else: not_boxes += box_paths_on_line if len(boxes) > 0 and len(page.words) > 0: print(len(boxes)) startIndex = 0 steps = round(len(page.words)/4) if not bool(UNITTESTING) else len(page.words) while startIndex+steps <= len(page.words): for word in page.words[startIndex:startIndex+steps]: word.process_boxes(boxes, tr_xmin=tr_xmin, tr_ymin=tr_ymin) word.create_correction_history(page) if not bool(UNITTESTING): bar.next() elif word.earlier_version is not None: #print(f'{word.text} -> {word.earlier_version.text}') if word.earlier_version.earlier_version is not None: print(f'{word.earlier_version.earlier_version.text}') save_page(page, page.page_tree.docinfo.URL) page = Page.create_cls(page.page_tree.docinfo.URL) startIndex += steps not bool(UNITTESTING) and bar.finish() except Exception as e: print(e) return not_boxes def reset_page(page): """Reset all words that have word_parts in order to run the script a second time. 
""" svg_pos_file = PathlibPath(page.page_tree.docinfo.URL) first_merge_version = svg_pos_file.parent / MERGED_DIR / svg_pos_file.name if first_merge_version.exists(): page = Page(str(first_merge_version)) else: word_with_wordparts = [ word for word in page.words if len(word.word_parts) > 0 ] word_with_wordparts += [ word for word in page.words if word.earlier_version is not None ] page_changed = False if len(word_with_wordparts) > 0: for word in word_with_wordparts: word.undo_partitioning() update_transkription_position_ids(word) page_changed = True no_line_numbers = [ word for word in page.words if word.line_number == -1 ] if len(no_line_numbers) > 0: for word in no_line_numbers: if len(word.transkription_positions) > 0: word.line_number = page.get_line_number((word.transkription_positions[0].top+word.transkription_positions[0].bottom)/2) else: msg = f'Word {word.id} {word.text} has no transkription_position!' warnings.warn(msg) page_changed = True if page_changed: page.update_and_attach_words2tree() def save_page(page, svg_pos_file, target_svg_pos_file=None, status=None, manuscript_file=None): """Save page to target_file and update status of file. """ page.update_and_attach_words2tree() if not UNITTESTING: if target_svg_pos_file is None: target_svg_pos_file = svg_pos_file if status is not None: update_svgposfile_status(svg_pos_file, manuscript_file=manuscript_file, status=status) write_pretty(xml_element_tree=page.page_tree, file_name=target_svg_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) def update_faksimile_line_positions(page): """Update faksimile_positions of the lines """ num_lines = len(page.line_numbers) ymin = page.text_field.ymin\ if page.text_field is not None\ else 0.0 for line_number in page.line_numbers: if len([ word.faksimile_positions[0] for word in page.words\ - if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) > 0: + if len(word.word_parts) < 2 and len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) > 0: line_number.faksimile_inner_top = min([ word.faksimile_positions[0].top for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) line_number.faksimile_inner_bottom = max([ word.faksimile_positions[0].bottom for word in page.words\ if len(word.faksimile_positions) > 0 and word.line_number == line_number.id ]) if line_number.id % 2 == 0: line_number.faksimile_outer_top = line_number.faksimile_inner_top - ymin line_number.faksimile_outer_bottom = line_number.faksimile_inner_bottom - ymin for index, line_number in enumerate(page.line_numbers): if line_number.faksimile_inner_bottom == 0.0\ or line_number.faksimile_inner_bottom < line_number.faksimile_inner_top: if index == 0 and num_lines > 1: line_number.faksimile_inner_bottom = page.line_numbers[index+1].top elif index == num_lines-1 and page.text_field is not None: line_number.faksimile_inner_bottom = round(page.text_field.height + page.text_field.ymin, 3) elif index > 0 and index < num_lines-1: line_number.faksimile_inner_bottom = page.line_numbers[index+1].faksimile_inner_top\ if page.line_numbers[index+1].faksimile_inner_top > page.line_numbers[index-1].faksimile_inner_bottom\ else page.line_numbers[index-1].faksimile_inner_bottom line_number.attach_object_to_tree(page.page_tree) def update_writing_process_ids(page): """Update the writing_process_ids of the words and split accordingly. 
""" for word in page.words: word.set_writing_process_id_to_transkription_positions(page) word.partition_according_to_writing_process_id() def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to process words after they have been merged with faksimile data. svgscripts/process_words_post_merging.py [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help show help -i|--include-missing-line-number run script on files that contain words without line numbers -r|--rerun rerun script on a svg_pos_file that has already been processed :return: exit code (int) """ status_not_contain = STATUS_POSTMERGED_OK include_missing_line_number = False try: opts, args = getopt.getopt(argv, "hir", ["help", "include-missing-line-number", "rerun" ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-i', '--include-missing-line-number'): include_missing_line_number = True elif opt in ('-r', '--rerun'): status_not_contain = '' if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): manuscript_file = file_a\ if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\ else None counter = 0 for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK, status_not_contain=status_not_contain): reset_page(page) no_line_numbers = [ word for word in page.words if word.line_number == -1 ] if not include_missing_line_number and len(no_line_numbers) > 0: not UNITTESTING and print(Fore.RED + f'Page {page.title}, {page.number} has words with no line number!') for word in no_line_numbers: not UNITTESTING and print(f'Word {word.id}: {word.text}') else: back_up(page, page.xml_file) not UNITTESTING and print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL) post_merging_processing_and_saving(page=page, manuscript_file=manuscript_file) counter += 1 not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]') else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: svgscripts/process_footnotes.py =================================================================== --- svgscripts/process_footnotes.py (revision 110) +++ svgscripts/process_footnotes.py (revision 111) @@ -1,294 +1,294 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This program can be used to process words after they have been merged with faksimile data. """ # Copyright (C) University of Basel 2019 {{{1 # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. 
If not, see 1}}} from colorama import Fore, Style import getopt import lxml.etree as ET import os from os import listdir, sep, path, setpgrp, devnull from os.path import exists, isfile, isdir, dirname, basename from pathlib import Path as PathlibPath from progress.bar import Bar import inspect import re import shutil import sys import warnings if dirname(__file__) not in sys.path: sys.path.append(dirname(__file__)) from datatypes.archival_manuscript import ArchivalManuscriptUnity from datatypes.page import Page, STATUS_MERGED_OK, STATUS_POSTMERGED_OK from datatypes.atypical_writing import AtypicalWriting from datatypes.clarification import Clarification from datatypes.editor_comment import EditorComment from datatypes.editor_correction import EditorCorrection from datatypes.footnotes import extract_footnotes from datatypes.imprint import extract_imprints from datatypes.line_continuation import LineContinuation from datatypes.standoff_tag import StandoffTag from datatypes.text import Text from datatypes.text_connection_mark import TextConnectionMark from datatypes.uncertain_decipherment import UncertainDecipherment from util import back_up from process_files import update_svgposfile_status sys.path.append('shared_util') from myxmlwriter import write_pretty, xml_has_type, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT __author__ = "Christian Steiner" __maintainer__ = __author__ __copyright__ = 'University of Basel' __email__ = "christian.steiner@unibas.ch" __status__ = "Development" __license__ = "GPL v3" __version__ = "0.0.1" UNITTESTING = False ATYPICAL_GROUP = re.compile(r'(.*:.*]\s*)(¿)(.*)') CLARIFICATION_GROUP = re.compile(r'(.*:.*]\s*)(Vk)(.*)') CONTINUATION_GROUP = re.compile(r'(.*:\s*)(Fortsetzung\s*)') COMMENT_GROUP = re.compile(r'(.*:.*])') EDITOR_CORRECTION_GROUP = re.compile(r'(.*:.*]\s*)(>[?]*)(.*)') LINE_REFERENCE_GROUP = re.compile(r'(\d+-|\d/(\d+/)*)*([0-9]+)(:.*)') LINE_REFERENCE_GROUP_START_INDEX = 1 LINE_REFERENCE_GROUP_MID_INDEX = 2 LINE_REFERENCE_GROUP_END_INDEX = 3 LINE_COMMENT_GROUP = re.compile(r'(.*\d+:)') UNCERTAINTY_WORD_GROUP = re.compile(r'(.*:.*]\s*)([>]*\?)(.*)') UNCERTAINTY_EDITOR_GROUP = re.compile(r'(.*)(\?)') WORD_REFERENCE_GROUP = re.compile(r'(.*[0-9]+:\s*)(.*)(].*)') DEBUG = False def categorize_footnotes(page, footnotes=None, debug=False, skip_after=-1.0, find_content=False): """Categorize footnotes. """ global DEBUG DEBUG = debug if footnotes is None: footnotes = extract_footnotes(page, skip_after=skip_after) for footnote in footnotes: line_match = re.match(LINE_REFERENCE_GROUP, footnote.content) if line_match is not None: _process_line_match(page, footnote, line_match) else: warnings.warn(f'Unknown editor comment without a line reference: <{footnote}>') if find_content and len(page.text_connection_marks) > 0: TextConnectionMark.find_content_in_footnotes(page, footnotes=footnotes) page.update_and_attach_words2tree() for line in page.lines: line.attach_object_to_tree(page.page_tree) DEBUG = False if not UNITTESTING: write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION) def save_imprints(page): """Extract imprints and attach them to the page tree. 
""" for imprint in extract_imprints(page): imprint.attach_object_to_tree(page.page_tree) if not UNITTESTING: write_pretty(xml_element_tree=page.page_tree, file_name=page.page_tree.docinfo.URL,\ script_name=f'{__file__}:{inspect.currentframe().f_back.f_code.co_name}', file_type=FILE_TYPE_SVG_WORD_POSITION) def _is_uncertain(footnote) -> bool: """Return whether footnote contains sign for uncertainty. """ uncertain_match = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content) return (uncertain_match is not None\ and len([ markup for markup in footnote.standoff_markups\ if markup.css_string.endswith('italic;')\ and uncertain_match.end() >= markup.startIndex\ and uncertain_match.end() <= markup.endIndex ]) > 0) def _process_line_match(page, footnote, line_match): """Process footnote if reference to a line matches. """ word_match = re.match(WORD_REFERENCE_GROUP, footnote.content) end_line_number = int(line_match.group(LINE_REFERENCE_GROUP_END_INDEX)) lines = [] if line_match.group(LINE_REFERENCE_GROUP_START_INDEX) is not None: if line_match.group(LINE_REFERENCE_GROUP_MID_INDEX) is not None: line_ids = [ int(line_id) for line_id in\ line_match.group(LINE_REFERENCE_GROUP_START_INDEX).split('/')\ if line_id != '' ] + [ end_line_number ] lines = [ line for line in page.lines if line.id in line_ids ] else: start_line_number = int(line_match.group(1)[0:-1]) lines = [ line for line in page.lines if line.id >= start_line_number and line.id <= end_line_number ] else: lines = [ line for line in page.lines if line.id == end_line_number ] if word_match is not None: _process_word_match(page.words, footnote, line_match, word_match.group(2), end_line_number) elif len(lines) > 0: uncertain_match = re.match(UNCERTAINTY_EDITOR_GROUP, footnote.content) for line in lines: _process_line_reference(page, footnote, line, _is_uncertain(footnote)) else: warnings.warn(f'Footnote refers to missing line {line_number}: {footnote}') def _process_line_reference(page, footnote, line, is_uncertain): """Process footnote if there is a line reference. """ continuation_match = re.match(CONTINUATION_GROUP, footnote.content) if continuation_match is not None: reference_string = footnote.content[continuation_match.end():] if is_uncertain: reference_string = reference_string[:-1] line.editor_comments.append(LineContinuation.create_cls(reference_string=reference_string, is_uncertain=is_uncertain)) else: comment_match = re.match(LINE_COMMENT_GROUP, footnote.content) if comment_match is not None: is_uncertain = _is_uncertain(footnote) comment = footnote.content[comment_match.end():-1].strip()\ if is_uncertain\ else footnote.content[comment_match.end():].strip() line.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain)) else: warnings.warn(f'Unknown editor comment for line "{line.id}": <{footnote}>') def _process_word_match(words, footnote, line_match, word_text, line_number, parent_word_composition=None): """Process footnote if there is a word reference. 
""" referred_words = [ word for word in words\ if word.line_number == line_number\ and (word.text == word_text\ or re.match(rf'\W*{word_text}\W', word.text)\ or word.edited_text == word_text) ] referred_word_parts = [ word.word_parts for word in words\ if word.line_number == line_number\ and len(word.word_parts) > 0\ and word_text in [ wp.text for wp in word.word_parts ] ] overwritten_word_matches = [ word for word in words\ if word.line_number == line_number\ and len(word.word_parts) > 0\ and len([word_part for word_part in word.word_parts\ if word_part.overwrites_word is not None\ and word_part.overwrites_word.text == word_text]) > 0] if len(referred_words) > 0\ or len(overwritten_word_matches) > 0\ or len(referred_word_parts) > 0: word = None if len(referred_words) == 1: word = referred_words[0] elif len(overwritten_word_matches) > 0: word = [ word_part.overwrites_word for word_part in overwritten_word_matches[0].word_parts\ if word_part.overwrites_word is not None and word_part.overwrites_word.text == word_text][0] elif len(referred_word_parts) > 0: word = [ word_part for word_part in referred_word_parts[0] if word_part.text == word_text ][0] else: word = [ better_word for better_word in referred_words if better_word.text == word_text][0] atypical_match = re.match(ATYPICAL_GROUP, footnote.content) correction_match = re.match(EDITOR_CORRECTION_GROUP, footnote.content) clarification_match = re.match(CLARIFICATION_GROUP, footnote.content) is_uncertain = re.match(UNCERTAINTY_WORD_GROUP, footnote.content) is not None if correction_match is not None: correction = correction_match.group(3).strip() - word.editor_comment = EditorCorrection(correction_text=correction, is_uncertain=is_uncertain) + word.editor_comments.append(EditorCorrection(correction_text=correction, is_uncertain=is_uncertain)) if not is_uncertain: word.edited_text = correction elif clarification_match is not None: - word.editor_comment = Clarification(text=footnote.extract_part(word_text, css_filter='bold;')) + word.editor_comments.append(Clarification(text=footnote.extract_part(word_text, css_filter='bold;'))) elif atypical_match is not None: text = footnote.extract_part(word_text, css_filter='bold;')\ if footnote.markup_contains_css_filter('bold;')\ else None - word.editor_comment = AtypicalWriting(text=text) + word.editor_comments.append(AtypicalWriting(text=text)) elif is_uncertain: - word.editor_comment = UncertainDecipherment() + word.editor_comments.append(UncertainDecipherment()) else: comment_match = re.match(COMMENT_GROUP, footnote.content) if comment_match is not None: is_uncertain = _is_uncertain(footnote) comment = footnote.content[comment_match.end():-1].strip()\ if is_uncertain\ else footnote.content[comment_match.end():].strip() - word.editor_comment = EditorComment(comment=comment, is_uncertain=is_uncertain) + word.editor_comments.append(EditorComment(comment=comment, is_uncertain=is_uncertain)) else: warnings.warn(f'Unknown editor comment for word "{word.text}": <{footnote}>') elif re.match(r'.*\s.*', word_text): for word_part in word_text.split(' '): _process_word_match(words, footnote, line_match, word_part, line_number, parent_word_composition=word_text) elif len([word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]) > 0: new_words = [] for word in [word for word in words if word.line_number == -1 and len(word.word_parts) > 0 ]: new_words += word.word_parts _process_word_match(new_words, footnote, line_match, word_text, line_number) else: warnings.warn(f'No word found with text 
"{word_text}" on line {line_number}: <{footnote}>') def usage(): """prints information on how to use the script """ print(main.__doc__) def main(argv): """This program can be used to process the footnotes of a page. svgscripts/process_footnotes.py [OPTIONS] a xml file about a manuscript, containing information about its pages. a xml file about a page, containing information about svg word positions. OPTIONS: -h|--help show help -s|--skip-until=left skip all nodes.get('X') < left :return: exit code (int) """ skip_after=-1.0 try: opts, args = getopt.getopt(argv, "hs:", ["help", "skip-until=" ]) except getopt.GetoptError: usage() return 2 for opt, arg in opts: if opt in ('-h', '--help'): usage() return 0 elif opt in ('-s', '--skip-until'): skip_after = float(arg) if len(args) < 1: usage() return 2 exit_status = 0 file_a = args[0] if isfile(file_a): manuscript_file = file_a\ if xml_has_type(FILE_TYPE_XML_MANUSCRIPT, xml_source_file=file_a)\ else None counter = 0 for page in Page.get_pages_from_xml_file(file_a, status_contains=STATUS_MERGED_OK): if not UNITTESTING: print(Fore.CYAN + f'Processing {page.title}, {page.number} ...' + Style.RESET_ALL) back_up(page, page.xml_file) categorize_footnotes(page, skip_after=skip_after, find_content=True) save_imprints(page) counter += 1 not UNITTESTING and print(Style.RESET_ALL + f'[{counter} pages processed]') else: raise FileNotFoundError('File {} does not exist!'.format(file_a)) return exit_status if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Index: tests_svgscripts/test_text_connection_mark.py =================================================================== --- tests_svgscripts/test_text_connection_mark.py (revision 110) +++ tests_svgscripts/test_text_connection_mark.py (revision 111) @@ -1,76 +1,84 @@ import unittest from os import sep, path from os.path import dirname, isdir import lxml.etree as ET import sys sys.path.append('svgscripts') from datatypes.matrix import Matrix from datatypes.page import Page from datatypes.reference import Reference from datatypes.transkriptionField import TranskriptionField from datatypes.transkription_position import TranskriptionPosition from datatypes.text_connection_mark import TextConnectionMark from datatypes.word import Word class TestTextConnectionMark(unittest.TestCase): def setUp(self): DATADIR = dirname(__file__) + sep + 'test_data' self.xml_file = DATADIR + sep + 'N_VII_1_page008.xml' mylist = {'text': '*', 'id': '0', 'line-number': '2' } self.node = ET.Element(TextConnectionMark.XML_TAG, attrib=mylist) word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)')) self.transkription_positions = [ word_position ] word_position.attach_object_to_tree(self.node) self.test_tcm_xml = DATADIR + sep + 'N_VII_1_page013.xml' def test_create_cls(self): text_connection_mark = TextConnectionMark.create_cls(self.node) self.assertEqual(text_connection_mark.id, 0) self.assertEqual(text_connection_mark.transkription_positions[0].bottom, 11) self.assertEqual(text_connection_mark.transkription_positions[0].height, 10) self.assertEqual(text_connection_mark.transkription_positions[0].top, 1) self.assertEqual(text_connection_mark.transkription_positions[0].left, 0) self.assertEqual(text_connection_mark.transkription_positions[0].width, 10) self.assertEqual(text_connection_mark.text, '*') self.assertEqual(text_connection_mark.line_number, 2) self.assertEqual(text_connection_mark.transkription_positions[0].transform.isRotationMatrix(), True) + def 
Index: tests_svgscripts/test_text_connection_mark.py
===================================================================
--- tests_svgscripts/test_text_connection_mark.py (revision 110)
+++ tests_svgscripts/test_text_connection_mark.py (revision 111)
@@ -1,76 +1,84 @@
import unittest
from os import sep, path
from os.path import dirname, isdir
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from datatypes.matrix import Matrix
from datatypes.page import Page
from datatypes.reference import Reference
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.text_connection_mark import TextConnectionMark
from datatypes.word import Word

class TestTextConnectionMark(unittest.TestCase):
    def setUp(self):
        DATADIR = dirname(__file__) + sep + 'test_data'
        self.xml_file = DATADIR + sep + 'N_VII_1_page008.xml'
        mylist = {'text': '*', 'id': '0', 'line-number': '2' }
        self.node = ET.Element(TextConnectionMark.XML_TAG, attrib=mylist)
        word_position = TranskriptionPosition(x=0, y=1, height=10, width=10, matrix=Matrix('matrix(0.94 0.342 -0.342 0.94 0 0)'))
        self.transkription_positions = [ word_position ]
        word_position.attach_object_to_tree(self.node)
        self.test_tcm_xml = DATADIR + sep + 'N_VII_1_page013.xml'

    def test_create_cls(self):
        text_connection_mark = TextConnectionMark.create_cls(self.node)
        self.assertEqual(text_connection_mark.id, 0)
        self.assertEqual(text_connection_mark.transkription_positions[0].bottom, 11)
        self.assertEqual(text_connection_mark.transkription_positions[0].height, 10)
        self.assertEqual(text_connection_mark.transkription_positions[0].top, 1)
        self.assertEqual(text_connection_mark.transkription_positions[0].left, 0)
        self.assertEqual(text_connection_mark.transkription_positions[0].width, 10)
        self.assertEqual(text_connection_mark.text, '*')
        self.assertEqual(text_connection_mark.line_number, 2)
        self.assertEqual(text_connection_mark.transkription_positions[0].transform.isRotationMatrix(), True)

+    def test_instantiate_as_word(self):
+        text_source = Reference(first_line=1, title='ASDF', page_number='5c')
+        text_source.attach_object_to_tree(self.node)
+        text_connection_mark = TextConnectionMark.instantiate_as_word(self.node)
+        self.assertEqual(text_connection_mark.id, 0)
+        self.assertEqual(text_connection_mark.text, '*')
+        self.assertEqual(text_connection_mark.line_number, 2)
+        self.assertEqual(len(text_connection_mark.editor_comments), 1)

    def test_attach_word_to_tree(self):
        text_connection_mark = TextConnectionMark.create_cls(self.node)
        text_connection_mark.text_source = Reference(first_line=1, title='ASDF', page_number='5c')
        empty_tree = ET.ElementTree(ET.Element('page'))
        text_connection_mark.attach_word_to_tree(empty_tree)
        #print(ET.dump(empty_tree.getroot()))
        for node in empty_tree.xpath('//' + TextConnectionMark.XML_TAG):
            mark = TextConnectionMark.create_cls(node)
            self.assertEqual(mark.id, 0)
            self.assertEqual(mark.transkription_positions[0].bottom, 11)
            self.assertEqual(mark.transkription_positions[0].height, 10)
            self.assertEqual(mark.transkription_positions[0].top, 1)
            self.assertEqual(mark.transkription_positions[0].left, 0)
            self.assertEqual(mark.transkription_positions[0].width, 10)
            self.assertEqual(mark.text, '*')
            self.assertEqual(mark.line_number, 2)
            self.assertEqual(mark.transkription_positions[0].transform.isRotationMatrix(), True)
            self.assertEqual(mark.text_source.first_line, text_connection_mark.text_source.first_line)
            self.assertEqual(mark.text_source.page_number, text_connection_mark.text_source.page_number)

    def test_get_semanticAndDataDict(self):
        dictionary = TextConnectionMark.get_semantic_dictionary()
        #print(dictionary)

    def test_find_content(self):
        page = Page(self.test_tcm_xml)
        transkription_field = TranskriptionField(page.source)
        svg_tree = ET.parse(page.source)
        page.text_connection_marks = [ TextConnectionMark.create_cls_from_word(word) for word in page.words if word.text == TextConnectionMark.SPECIAL_CHAR_LIST[1]]
        TextConnectionMark.find_content_in_footnotes(page, transkription_field, svg_tree)
        self.assertEqual(len(page.text_connection_marks), 4)
        for tcm in page.text_connection_marks:
            self.assertEqual(tcm.text_source is not None, True)
            self.assertEqual(tcm.text_source.first_line > -1, True)
            self.assertEqual(tcm.text_source.page_number, '14')

if __name__ == "__main__":
    unittest.main()

Index: tests_svgscripts/test_reference.py
===================================================================
--- tests_svgscripts/test_reference.py (revision 110)
+++ tests_svgscripts/test_reference.py (revision 111)
@@ -1,50 +1,54 @@
import unittest
from os import sep, path
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from datatypes.reference import Reference

class TestReference(unittest.TestCase):
    def test_init(self):
        reference_string = '5'
        reference = Reference.create_cls(reference_string=reference_string, title='ASDF')
        self.assertEqual(reference.first_line, 5)
        reference_string = '5,5'
        reference = Reference.create_cls(reference_string=reference_string, title='ASDF')
        self.assertEqual(reference.first_line, 5)
        self.assertEqual(reference.page_number, str(5))
        reference_string = 'ASDF 5,5-8'
        reference = Reference.create_cls(reference_string=reference_string)
        self.assertEqual(reference.title, 'ASDF')
        self.assertEqual(reference.first_line, 5)
        self.assertEqual(reference.last_line, 8)
        self.assertEqual(reference.page_number, str(5))
        reference_string = 'ASDF 5,5 a .'
        reference = Reference.create_cls(reference_string=reference_string)
        self.assertEqual(reference.page_number, str(5))
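
    # Reading aid (not part of the diff): the strings parsed above follow the
    # pattern "[title ]page,first_line[-last_line]"; a bare number such as '5'
    # is read as the first line only. A hypothetical further example:
    #     Reference.create_cls(reference_string='XYZ 3,10-12')
    #     # -> title='XYZ', page_number='3', first_line=10, last_line=12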
    def test_attach_create(self):
        reference_string = 'ASDF 5,5-8'
        reference = Reference.create_cls(is_uncertain=True, reference_string=reference_string)
        empty_tree = ET.ElementTree(ET.Element('page'))
        reference.attach_object_to_tree(empty_tree)
        #print(ET.dump(empty_tree.getroot()))
        nodes = empty_tree.xpath('//' + Reference.XML_TAG)
        self.assertEqual(len(nodes), 1)
        reference_copy = Reference.create_cls(node=nodes[0])
        self.assertEqual(reference.id, reference_copy.id)
        self.assertEqual(reference.is_uncertain, reference_copy.is_uncertain)
        self.assertEqual(reference.title, reference_copy.title)
        self.assertEqual(reference.page_number, reference_copy.page_number)
        self.assertEqual(reference.first_line, reference_copy.first_line)
        self.assertEqual(reference.last_line, reference_copy.last_line)

    def test_get_semantic_dictionary(self):
        dictionary = Reference.get_semantic_dictionary()
        #print(dictionary)
-
+
+    def test_toString(self):
+        reference_string = 'ASDF 5,5-8'
+        reference = Reference.create_cls(is_uncertain=True, reference_string=reference_string)
+        self.assertEqual(reference.toString(), reference_string)

if __name__ == "__main__":
    unittest.main()

Index: tests_svgscripts/test_process_footnotes.py
===================================================================
--- tests_svgscripts/test_process_footnotes.py (revision 110)
+++ tests_svgscripts/test_process_footnotes.py (revision 111)
@@ -1,54 +1,54 @@
import unittest
from os import sep, path, remove
from os.path import isdir, isfile, dirname
import shutil
import sys
import lxml.etree as ET
import warnings
import sys
sys.path.append('svgscripts')
from datatypes.footnotes import extract_footnotes
from datatypes.imprint import Imprint
from datatypes.page import Page
import process_footnotes
from process_footnotes import categorize_footnotes, main, save_imprints

class TestExtractFootnotes(unittest.TestCase):
    def setUp(self):
        process_footnotes.UNITTESTING = True
        DATADIR = path.dirname(__file__) + sep + 'test_data'
        self.test_footnote = DATADIR + sep + 'W_I_8_neu_125-01.svg'
        self.test_footnote_verso = DATADIR + sep + 'N_VII_1_xp5_4_page5.svg'
        self.test_footnote_recto = DATADIR + sep + 'N_VII_1_xp5_4_page6.svg'
        self.test_footnote_multi = DATADIR + sep + 'N_VII_1_xp5_4_page13.svg'
        self.test_footnote_multi_xml = DATADIR + sep + 'N_VII_1_page013.xml'
        self.test_categorize_footnote = DATADIR + sep + 'N_VII_1_page006.xml'

    def test_categorize_footnotes(self):
        page = Page(self.test_categorize_footnote)
        footnotes = extract_footnotes(page, svg_file=self.test_footnote_recto)
        categorize_footnotes(page, footnotes)
-        words_with_comments = [ word for word in page.words if word.editor_comment is not None ]
+        words_with_comments = [ word for word in page.words if len(word.editor_comments) > 0 ]
        self.assertEqual(len(words_with_comments), 4)
        lines_with_comments = [ line for line in page.lines if len(line.editor_comments) > 0 ]
        self.assertEqual(len(lines_with_comments), 1)
        page = Page('xml/W_II_1_page141.xml')
        footnotes = extract_footnotes(page)
        categorize_footnotes(page, footnotes, debug=True)
-        words_with_comments = [ word for word in page.words if word.editor_comment is not None ]
+        words_with_comments = [ word for word in page.words if len(word.editor_comments) > 0 ]

    def test_save_imprints(self):
        page = Page(self.test_categorize_footnote)
        save_imprints(page)
        self.assertEqual(len(page.page_tree.xpath('//' + Imprint.XML_TAG)), 2)
        #print(ET.dump(page.page_tree.getroot()))
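
    # Migration note (not part of the diff): with revision 111 a word carries
    # a list of editor comments instead of a single one, so checks change from
    #     word.editor_comment is not None
    # to
    #     len(word.editor_comments) > 0
    # as exercised in test_categorize_footnotes above.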
    def test_main(self):
        self.assertEqual(main(['xml/N_VII_1_page005.xml']), 0)

if __name__ == "__main__":
    unittest.main()

Index: tests_svgscripts/test_extractWordPosition.py
===================================================================
--- tests_svgscripts/test_extractWordPosition.py (revision 110)
+++ tests_svgscripts/test_extractWordPosition.py (revision 111)
@@ -1,202 +1,236 @@
import unittest
import os
from os import sep, path
from os.path import isfile, isdir, dirname
import re
import shutil
import tempfile
import lxml.etree as ET
from lxml.etree import XMLSyntaxError
import sys
sys.path.append('svgscripts')
import extractWordPosition
from myxmlwriter import write_pretty
from datatypes.transkriptionField import TranskriptionField
from datatypes.matrix import Matrix
from datatypes.page_creator import PageCreator, FILE_TYPE_SVG_WORD_POSITION
from datatypes.page import Page
+from datatypes.positional_word_part import PositionalWordPart
from datatypes.pdf import PDFText
from datatypes.word import Word
from datatypes.lineNumber import LineNumber
from datatypes.word_insertion_mark import WordInsertionMark

def test_write(xml_element_tree=None, file_name=None):
    write_pretty(xml_element_tree=xml_element_tree, file_name=None, script_name='test', file_type=FILE_TYPE_SVG_WORD_POSITION)

class TestExtractor(unittest.TestCase):
    def setUp(self):
        extractWordPosition.Extractor.UNITTESTING = True
        DATADIR = dirname(__file__) + sep + 'test_data'
        self.test_file_find_word = DATADIR + sep + 'test_find_word.xml'
        self.test_dir = tempfile.mkdtemp()
        self.title = 'ABC 111'
        self.matrix_string = 'matrix(1 0 0 1 183.6558 197.9131)'
        self.test_file = DATADIR + sep + 'Mp_XIV_1_mytest_421.svg'
        self.test_empty_file = DATADIR + sep + 'my_empty_test.svg'
        self.test_source = DATADIR + sep + 'Mp_XIV_1_mytest_421.xml'
        self.xml420 = DATADIR + sep + 'Mp_XIV_1_page420.xml'
        self.pdf420 = DATADIR + sep + 'Mp_XIV_1_online_420.pdf'
        self.pdf_file = DATADIR + sep + 'W_I_8_page125.pdf'
        self.faulty_xml = DATADIR + sep + 'W_I_8_faulty_page125.xml'
        self.pdf_xml = DATADIR + sep + 'W_I_8_page125.xml'
        self.pdf_xml_source = DATADIR + sep + 'W_I_8_neu_125-01.svg'
        self.testA = DATADIR + sep + 'testA.xml'
        self.multipage = DATADIR + sep + 'multipage_small_above.svg'

    def test_extract_information(self):
        extractor = extractWordPosition.Extractor()
        page = extractor.extract_information(self.multipage, multipage_index=0)
        self.assertEqual(len(page.words), 59)
        self.assertEqual(page.multipage_index, 0)
        page = extractor.extract_information(self.multipage, multipage_index=1)
        self.assertEqual(page.multipage_index, 1)
        self.assertTrue(len(page.words) > 59)
        extractor = extractWordPosition.Extractor()
        source_page = Page('xml/Mp_XV_page78v.xml')
        extractor = extractWordPosition.Extractor()
        transkription_field = TranskriptionField(source_page.source)
        svg_tree = ET.parse(source_page.source)
        text_items = extractor.get_text_items(svg_tree.getroot(), transkription_field=transkription_field)
        self.assertTrue('matrix(1 0 0 1 115.6299 719.3535)' in [ item.get('transform') for item in text_items ])
        page = extractor.extract_information(source_page.source, svg_file=source_page.svg_file)
        self.assertTrue(page.svg_image.text_field is not None)
+    ##:map :w:!python3 -m unittest tests_svgscripts.test_extractWordPosition.TestExtractor.test_improved_extract_word_position
+    @unittest.skip('test with local file')
+    def test_improved_extract_word_position(self):
+        extractor = extractWordPosition.Extractor()
+        source_page = Page('xml/Mp_XV_page85v.xml')
+        print(len(source_page.words))
+        source_page.words = []
+        extractor = extractWordPosition.Extractor()
+        transkription_field = TranskriptionField(source_page.source)
+        svg_tree = ET.parse(source_page.source)
+        extractor.improved_extract_word_position(svg_tree, source_page, transkription_field=transkription_field)
+        print(len(source_page.words))
+        #for word in source_page.words: print(word.id, word.text)
+
    def test_update_title(self):
        extractor = extractWordPosition.Extractor(xml_dir=self.test_dir)
        extractor.update_title_and_manuscript('test')
        self.assertEqual(extractor.title, 'test')
        self.assertEqual(extractor.manuscript_file, '{}/test.xml'.format(self.test_dir))
        self.assertEqual(isfile('{}/test.xml'.format(self.test_dir)), True)

    def test_get_page_number(self):
        extractor = extractWordPosition.Extractor()
        self.assertEqual(extractor.get_page_number(self.test_file, page_number='1'), '001')
        self.assertEqual(extractor.get_page_number(self.test_file), '421')

    def test_get_file_name(self):
        extractor = extractWordPosition.Extractor()
        self.assertEqual(extractor.get_file_name(self.test_file), 'xml/Mp_XIV_1_mytest_421.xml')
        extractor = extractWordPosition.Extractor(title=self.title)
        self.assertEqual(extractor.get_file_name(self.test_file), 'xml/{}_page421.xml'.format(self.title.replace(' ', '_')))
        extractorA = extractWordPosition.Extractor(title=self.title)
        extractorB = extractWordPosition.Extractor(manuscript_file=extractorA.manuscript_file)
        self.assertEqual(extractorB.get_file_name(self.test_file), 'xml/{}_page421.xml'.format(self.title.replace(' ', '_')))

    def test_get_style(self):
        extractor = extractWordPosition.Extractor()
        svg_tree = ET.parse(self.test_file)
        sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
        self.assertEqual(sonderzeichen_list, [ 'st21', 'st23'])
        self.assertEqual(style_dict.get('st11').get('font-family'), 'Frutiger-Europeen')
        self.assertEqual(style_dict.get('st5').get('stroke'), '#CED5CE')

    def test_get_word_from_part_obj(self):
        extractor = extractWordPosition.Extractor()
        mylist = [{'text': 'a', 'class': 'asdf' }, {'text': 'b', 'endX': 0 }, {'text': 'c'}]
        self.assertEqual(extractor.get_word_from_part_obj(mylist), 'abc')

    def test_get_break_points(self):
        extractor = extractWordPosition.Extractor()
        page = Page(self.pdf_xml)
        page.source = self.pdf_xml_source
        matrix = Matrix('matrix(1 0 0 1 543.8164 173.9126)')
        matrixB = Matrix('matrix(1 0 0 1 573.6758 173.9126)')
        matrixC = Matrix('matrix(1 0 0 1 575.9873 173.9126)')
        mylist = [{'text': 'es', 'class': 'st5 st6', 'x': matrix.add2X(23.968), 'y': matrix.getY() },\
                  {'text': 'A', 'class': 'st9 st10', 'x': matrixB.getX(), 'y': matrixB.getY() },\
                  {'text': 'sich', 'class': "st5 st6", 'x': matrixC.getX(), 'y': matrixC.getY()}]
        break_points = extractor._get_break_points(page, mylist)
        self.assertTrue(len(break_points) > 0)
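
    # Reading aid (not part of the diff): the new test below mirrors
    # test_get_break_points, but first turns the word_part_objs dicts into
    # PositionalWordPart objects via CREATE_POSITIONAL_WORD_PART_LIST, since
    # _get_pwps_break_points operates on those objects rather than on dicts.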
+    def test_get_pwps_break_points(self):
+        extractor = extractWordPosition.Extractor()
+        page = Page(self.pdf_xml)
+        page.svg_file = "./svg/W_I_8_page125_web.svg"
+        page.source = self.pdf_xml_source
+        svg_path_tree = ET.parse(page.svg_file)
+        namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
+        matrix = Matrix('matrix(1 0 0 1 543.8164 173.9126)')
+        matrixB = Matrix('matrix(1 0 0 1 573.6758 173.9126)')
+        matrixC = Matrix('matrix(1 0 0 1 575.9873 173.9126)')
+        mylist = [{'text': 'es', 'class': 'st5 st6', 'x': matrix.add2X(23.968), 'y': matrix.getY() },\
+                  {'text': 'A', 'class': 'st9 st10', 'x': matrixB.getX(), 'y': matrixB.getY() },\
+                  {'text': 'sich', 'class': "st5 st6", 'x': matrixC.getX(), 'y': matrixC.getY()}]
+        pwps = []
+        for word_part_obj in mylist:
+            pwps += PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_path_tree, namespaces, page=page)
+        self.assertTrue(len(page.sonderzeichen_list) > 0)
+        break_points = extractor._get_pwps_break_points(page, pwps)
+        self.assertTrue(len(break_points) > 0)

    def test_get_text_items(self):
        svg_tree = ET.parse(self.test_file)
        extractor = extractWordPosition.Extractor()
        mytest_items = [ x for x in extractor.get_text_items(svg_tree.getroot()) ]
        self.assertEqual(len(mytest_items), 300)
        self.assertEqual(mytest_items[0].get('transform'), 'matrix(1 0 0 1 386.8218 57.1914)')
        tf = TranskriptionField(self.test_file)
        mytest_itemsTF = [ x for x in extractor.get_text_items(svg_tree.getroot(), transkription_field=tf) ]
        self.assertEqual(mytest_itemsTF[0].get('transform'), 'matrix(1 0 0 1 204.8618 91.7134)')

    def test_init_tree_and_target_file(self):
        target_file = self.testA
        page = PageCreator(target_file, title=self.title)
        tree = page.page_tree
        self.assertEqual(tree.getroot().get('title'), self.title)
        self.assertEqual(tree.getroot().findall('./style'), [])
        test_write(xml_element_tree=tree, file_name=target_file)
        page = PageCreator(target_file)
        tree = page.page_tree
        self.assertEqual(tree.getroot().get('title'), self.title)
        self.assertEqual(tree.getroot().findall('./style'), [])
        isfile(target_file) and os.remove(target_file)

    def test_add_style(self):
        extractor = extractWordPosition.Extractor()
        svg_tree = ET.parse(self.test_file)
        sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
        target_file = self.testA
        page = PageCreator(target_file,title=self.title)
        page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict)
        test_write(xml_element_tree=page.page_tree, file_name=target_file)
        fromTarget_xml_tree = ET.parse(target_file)
        self.assertEqual(fromTarget_xml_tree.getroot().get('title'), self.title)
        self.assertEqual(fromTarget_xml_tree.getroot().find("style").get('Sonderzeichen'), "st21 st23")
        self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st5']").get('stroke'), '#CED5CE')
        self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st11']").get('font-family'), 'Frutiger-Europeen')
        page = PageCreator(target_file)
        page.add_style(sonderzeichen_list=sonderzeichen_list, style_dict=style_dict)
        test_write(xml_element_tree=page.page_tree, file_name=target_file)
        fromTarget_xml_tree = ET.parse(target_file)
        self.assertEqual(fromTarget_xml_tree.getroot().get('title'), self.title)
        self.assertEqual(fromTarget_xml_tree.getroot().find("style").get('Sonderzeichen'), "st21 st23")
        self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st5']").get('stroke'), '#CED5CE')
        self.assertEqual(fromTarget_xml_tree.getroot().find("style").find("class[@name='st11']").get('font-family'), 'Frutiger-Europeen')
        isfile(target_file) and os.remove(target_file)
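
    # Reading aid (not part of the diff): the word_part_objs passed to
    # add_word below are plain dicts with the keys used throughout
    # extractWordPosition.py -- 'text', 'class', 'x' and 'y' (plus an
    # optional 'endX'), e.g.:
    #     { 'text': 'a', 'class': 'st22', 'x': 183.6558, 'y': 197.9131 }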
    def test_add_word(self):
        extractor = extractWordPosition.Extractor()
        svg_tree = ET.parse(self.test_file)
        mylist = [{'text': 'a' }, {'text': 'b' }, {'text': 'c' }]
        matrix = Matrix(self.matrix_string)
        for dict in mylist:
            dict['class'] = 'st22'
            dict['x'] = matrix.add2X(0)
            dict['y'] = matrix.getY()
        target_file = self.test_dir + sep + 'asdfasdf.xml'
        page = PageCreator(target_file)
        sonderzeichen_list, letterspacing_list, style_dict = extractor.get_style(svg_tree.getroot())
        page.add_style(sonderzeichen_list=sonderzeichen_list, letterspacing_list=letterspacing_list, style_dict=style_dict)
        self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 1)
        mylist[1]['text'] = 'A'
        mylist[1]['class'] = 'st21'
        mylist[1]['x'] = matrix.add2X(1)
        self.assertEqual(extractor.add_word(page, 0, mylist, '%', 0), 2)
        page.update_and_attach_words2tree()
        self.assertEqual(page.page_tree.getroot().xpath('//word[@id="1"]')[0].get('text'), 'a')
        self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]')[0].get('text'), 'c')
        self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]/transkription-position')[0].get('left'), '183.506')
        self.assertEqual(page.page_tree.getroot().xpath('//word[@id="2"]/transkription-position')[0].get('height'), '8.25')

    def test_extractor(self):
        extractor = extractWordPosition.Extractor()
        self.assertEqual(extractor.title, None)
        self.assertEqual(extractor.manuscript_file, None)
        self.assertEqual(extractor.xml_dir, 'xml/')
        self.assertEqual(extractor.manuscript_tree, None)

    def test_write_title_to_manuscript_file(self):
        extractor = extractWordPosition.Extractor(xml_dir=self.test_dir, title=self.title)
        self.assertEqual(isfile(extractor.manuscript_file), True)
        extractor = extractWordPosition.Extractor(manuscript_file=extractor.manuscript_file)
        self.assertEqual(extractor.title, self.title)

    def tearDown(self):
        isdir(self.test_dir) and shutil.rmtree(self.test_dir)
        isfile('{}/{}.xml'.format('xml', self.title.replace(' ', '_'))) and os.remove('{}/{}.xml'.format('xml', self.title.replace(' ', '_')))

if __name__ == "__main__":
    unittest.main()
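
# Note (not part of the diff): individual test modules can be run the same
# way as the editor mapping in test_extractWordPosition.py suggests, e.g.:
#
#     python3 -m unittest tests_svgscripts.test_extractWordPosition
#     python3 -m unittest tests_svgscripts.test_reference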