Index: py2ttl/convert.py
===================================================================
--- py2ttl/convert.py (revision 107)
+++ py2ttl/convert.py (revision 108)
@@ -1,115 +1,118 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert py objects to ontology and data in turtle format.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
import getopt
import lxml.etree as ET
from os import sep, path, listdir
from os.path import isfile, isdir, dirname, basename
from progress.bar import Bar
import re
import sys
sys.path.append('svgscripts')
from datatypes.archival_manuscript import ArchivalManuscriptUnity
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from class_spec import SemanticClass
from config import check_config_files_exist, get_datatypes_dir, PROJECT_NAME, PROJECT_ONTOLOGY_FILE, PROJECT_URL
from py2ttl_data import Py2TTLDataConverter
from py2ttl_ontology import Py2TTLOntologyConverter
sys.path.append('shared_util')
from myxmlwriter import xml2dict
-from main_util import get_manuscript_files
+from main_util import get_manuscript_files_and_include_status
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
FILE_TYPE_XML_PROJECT = "xmlProjectFile"
def usage():
"""prints information on how to use the script
"""
print(main.__doc__)
def main(argv):
"""This program can be used to convert py objects to a owl:Ontology and rdf data in turtle format.
py2ttl/convert.py [OPTIONS] [ ...]
xml file of type shared_util.myxmlwriter.FILE_TYPE_XML_MANUSCRIPT.
OPTIONS:
-h|--help: show help
-i|--include-status=STATUS include pages with status = STATUS. STATUS is a ':' separated string of status, e.g. 'OK:faksimile merged'.
:return: exit code (int)
"""
check_config_files_exist()
datatypes_dir = get_datatypes_dir()
source_ontology_file = PROJECT_ONTOLOGY_FILE
target_ontology_file = '.{0}{1}-ontology_autogenerated.ttl'.format(sep, PROJECT_NAME)
manuscript_file = None
page_status_list = [ 'OK', 'faksimile merged' ]
try:
opts, args = getopt.getopt(argv, "hi:", ["help", "include-status="])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help'):
usage()
return 0
elif opt in ('-i', '--include-status'):
page_status_list = arg.split(':')
if len(args) < 1 :
usage()
return 2
ontology_created = False
ontology_converter = Py2TTLOntologyConverter(project_ontology_file=source_ontology_file)
output = 2
- for manuscript_file in get_manuscript_files(args):
+ for manuscript_file, include_status in get_manuscript_files_and_include_status(args):
if not isfile(manuscript_file):
usage()
return 2
if not ontology_created:
print(Fore.CYAN + 'Create ontology from "{}" ...'.format(manuscript_file))
if ontology_converter.create_ontology(datatypes_dir, target_ontology_file) == 0:
print(Fore.GREEN + '[Ontology file {0} created]'.format(target_ontology_file))
ontology_created = True
else:
return 2
- print(Fore.CYAN + 'Create data from "{}" ...'.format(manuscript_file))
+ current_page_status_list = page_status_list\
+ if include_status is None\
+ else include_status.split(':')
+ print(Fore.CYAN + f'Create data from "{manuscript_file}" with status "{current_page_status_list}" ...')
data_converter = Py2TTLDataConverter(manuscript_file, mapping_dictionary=ontology_converter.uri_mapping4cls_and_properties)
- output = data_converter.convert(page_status_list=page_status_list)
+ output = data_converter.convert(page_status_list=current_page_status_list)
return output
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: shared_util/main_util.py
===================================================================
--- shared_util/main_util.py (revision 107)
+++ shared_util/main_util.py (revision 108)
@@ -1,93 +1,103 @@
import lxml.etree as ET
from os.path import isfile, isdir, dirname, basename
from svgpathtools import svg2paths2, svg_to_paths
import sys
sys.path.append('svgscripts')
from datatypes.path import Path
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
FILE_TYPE_XML_PROJECT = "xmlProjectFile"
def create_function_dictionary(list_of_keys, target_function, function_dictionary=None) -> dict:
"""Create a function_dictionary
"""
if function_dictionary is None:
function_dictionary = {}
for key in list_of_keys:
function_dictionary.update({key: target_function})
return function_dictionary
def get_manuscript_files(args: list) ->list:
"""Return a list of manuscript files. If first element is of type FILE_TYPE_XML_PROJECT read from
xml file and return as list of filenames.
"""
if len(args) == 1\
and args[0].endswith('.xml')\
and ET.parse(args[0]).getroot().find('metadata/type').text == FILE_TYPE_XML_PROJECT:
return ET.parse(args[0]).xpath('//manuscript[contains(@status, "OK")]/@file')
return args
+def get_manuscript_files_and_include_status(args: list) ->list:
+ """Return a list tuples of manuscript files and optional include status. If first element is of type FILE_TYPE_XML_PROJECT read from
+ xml file and return as list of tuples of filename (@files) and include status for manuscript pages (@include).
+ """
+ if len(args) == 1\
+ and args[0].endswith('.xml')\
+ and ET.parse(args[0]).getroot().find('metadata/type').text == FILE_TYPE_XML_PROJECT:
+ return [ (node.get('file'),node.get('include')) for node in ET.parse(args[0]).xpath('//manuscript[contains(@status, "OK")]')]
+ return args
+
def extract_paths_on_tf(page, transkription_field=None, new_style_prefix='tln', outsiders=None, outsider_attributes=None) ->list:
"""Extract all paths on transkription_field.
:return: a list of datatypes.path.Path
"""
if page.source is not None and isfile(page.source):
if transkription_field is None:
transkription_field = TranskriptionField(page.source, multipage_index=page.multipage_index)
paths, attributes = svg_to_paths.svg2paths(page.source)
allpaths_on_tf = []
for index, path in enumerate(paths):
attribute = attributes[index]
if len(path) > 0\
and path != transkription_field.path\
and path.bbox()[0] >= transkription_field.xmin\
and path.bbox()[1] <= transkription_field.xmax\
and path.bbox()[2] >= transkription_field.ymin\
and path.bbox()[3] <= transkription_field.ymax:
style_class = attribute.get('class')
if style_class is None and attribute.get('style') is not None:
style_class = create_new_style(page, attribute.get('style'), new_style_prefix=new_style_prefix)
allpaths_on_tf.append(Path.create_cls(id=index, path=path, style_class=style_class, page=page))
elif outsiders is not None\
and len(path) > 0\
and path != transkription_field.path:
style_class = attribute.get('class')
if style_class is None and attribute.get('style') is not None:
style_class = create_new_style(page, attribute.get('style'), new_style_prefix=new_style_prefix)
outsiders.append(Path.create_cls(id=index, path=path, style_class=style_class, page=page))
outsider_attributes.append(attribute)
return allpaths_on_tf
else:
return []
def create_new_style(page, style_attribute_string, new_style_prefix='tln') ->str:
"""Create new style, update page and return new style_class.
"""
style_dict = {}
style_class = None
for key_content in style_attribute_string.split(';'):
if ':' in key_content:
key, content = tuple(key_content.split(':'))
style_dict.update({ key: content})
if style_dict in page.style_dict.values():
style_class = list(page.style_dict.keys())[list(page.style_dict.values()).index(style_dict)]
else:
new_style_index = len([ k for k in page.style_dict.keys() if k.startswith(new_style_prefix) ])
style_class = f'{new_style_prefix}{new_style_index}'
page.style_dict.update({style_class: style_dict })
page.add_style(sonderzeichen_list=page.sonderzeichen_list, letterspacing_list=page.letterspacing_list,\
style_dict=page.style_dict)
return style_class
def get_paths_near_position(tp: TranskriptionPosition, paths: list, xmin=0, ymin=0, do_not_include_d_attributes=None) ->list:
"""Given a transkription position and a list of svgscripts.datatypes.path.Path,
return a list of paths near this position.
"""
tp_x = tp.left + (tp.width/2) + xmin
tp_y = tp.top + (tp.height/2) + ymin
do_not_include_d_attributes = do_not_include_d_attributes if do_not_include_d_attributes is not None else []
return [ path.d_attribute for path in Path.get_nearest_paths(paths, tp_x, tp_y) if path.d_attribute not in do_not_include_d_attributes ]
Index: Friedrich-Nietzsche-late-work-ontology.ttl
===================================================================
--- Friedrich-Nietzsche-late-work-ontology.ttl (revision 107)
+++ Friedrich-Nietzsche-late-work-ontology.ttl (revision 108)
@@ -1,63 +1,143 @@
@prefix dct: .
@prefix document: .
@prefix homotypic: .
@prefix stoff: .
@prefix text: .
@prefix owl: .
@prefix rdfs: .
+@prefix rdf: .
@prefix xsd: .
@prefix tln: .
a owl:Ontology;
dct:license ;
dct:title "An ontology about the collected late works of Friedrich Nietzsche"@en;
dct:description """Formal description of specific concepts in the scientific study of Friedrich Nietzsches late work."""@en;
dct:creator "Dominique Steinbach, tool coordinator/software developer, NIE-INE/digital edition of der späte Nietzsche, Basel University, Switzerland"@en;
dct:contributor "Christian Steiner, software developer, digital edition of der späte Nietzsche, University of Basel, Switzerland"@en;
dct:publisher "Basel University, Switzerland"@en.
+tln:TextGenesis a owl:Class ;
+ rdfs:label "identifies a genetic order of text versions"@en ;
+ rdfs:comment "Identifies a genetic order of text versions, i.e. groups text units as earlier and later versions of each other."@en ;
+ rdfs:isDefinedBy .
+
+tln:IdentifiedTextVersion a owl:Class ;
+ rdfs:label "identifies a list of text unities as a text version"@en ;
+ rdfs:comment "Identification of a list of text unities (e.g. pages or parts of pages) as a text version for which there is an earlier or later version."@en ;
+ rdfs:isDefinedBy .
+
+tln:PartOfPageTextUnit a owl:Class ;
+ rdfs:label "identifies a part of a page as a text unity"@en ;
+ rdfs:comment "Identification of a part of page as a text unity."@en ;
+ rdfs:isDefinedBy ;
+ rdfs:subClassOf [ a owl:Restriction ;
+ owl:cardinality "1"^^xsd:nonNegativeInteger ;
+ owl:onProperty tln:belongsToPage ],
+ [ a owl:Restriction ;
+ owl:cardinality "1"^^xsd:nonNegativeInteger ;
+ owl:onProperty tln:startLine ],
+ [ a owl:Restriction ;
+ owl:cardinality "1"^^xsd:nonNegativeInteger ;
+ owl:onProperty tln:endLine ] .
+
+tln:ExternalTextUnit a owl:Class ;
+    rdfs:label "a text unit that has been published external to the digital edition"@en ;
+ rdfs:comment "A text unit that has been published external to the digital edition."@en ;
+ rdfs:isDefinedBy ;
+ rdfs:subClassOf tln:IdentifiedTextVersion .
tln:Page a owl:Class ;
rdfs:subClassOf document:Page .
+tln:belongsToPage a owl:ObjectProperty ;
+ rdfs:label "relates a part of a page with the page it is a part of"@en ;
+ rdfs:comment "Relates a part of a page with the page it is a part of."@en ;
+ rdfs:isDefinedBy ;
+ rdfs:domain tln:PartOfPageTextUnit ;
+ rdfs:range tln:Page.
+
+tln:startLine a owl:ObjectProperty ;
+ rdfs:label "relates a part of a page with the line it starts with"@en ;
+ rdfs:comment "Relates a part of a page with the line it starts with."@en ;
+ rdfs:isDefinedBy ;
+ rdfs:domain tln:PartOfPageTextUnit ;
+ rdfs:range tln:Line.
+
+tln:endLine a owl:ObjectProperty ;
+ rdfs:label "relates a part of a page with the line it ends with"@en ;
+ rdfs:comment "Relates a part of a page with the line it ends with."@en ;
+ rdfs:isDefinedBy ;
+ rdfs:domain tln:PartOfPageTextUnit ;
+ rdfs:range tln:Line.
+
+tln:identifiesAsVersion a owl:ObjectProperty ;
+    rdfs:label "groups a list of text unities together as an identified text version"@en ;
+    rdfs:comment "Groups a list of text unities together as an identified text version for which there is an earlier or later version."@en ;
+ rdfs:isDefinedBy ;
+ rdfs:domain tln:IdentifiedTextVersion ;
+ rdfs:range rdf:List.
+
+tln:hasGeneticOrder a owl:ObjectProperty ;
+ rdfs:label "relates a list of text versions to an identified genetic order"@en ;
+ rdfs:comment "Relates a list of text versions to an identified genetic order. The position in the list determines the version of a text unit."@en ;
+ rdfs:isDefinedBy ;
+ rdfs:domain tln:TextGenesis ;
+ rdfs:range rdf:List.
+
+tln:textUnitHasTitle a owl:ObjectProperty ;
+    rdfs:label "relates an externally published text unit with a title"@en ;
+    rdfs:comment "Relates an externally published text unit with a title by which it can be identified."@en ;
+ rdfs:isDefinedBy ;
+ rdfs:domain tln:ExternalTextUnit ;
+ rdfs:range xsd:string .
+
+tln:textUnitHasUrl a owl:ObjectProperty ;
+    rdfs:label "relates an externally published text unit with a URL"@en ;
+    rdfs:comment "Relates an externally published text unit with a URL by which it can be visited."@en ;
+ rdfs:isDefinedBy ;
+ rdfs:domain tln:ExternalTextUnit ;
+ rdfs:range xsd:anyURI .
+
tln:hasImage a owl:ObjectProperty ;
rdfs:label "relates a page to a image"@en ;
rdfs:comment "relates a page to an image that has a textfield that specifies the area where the writing that constitutes the page can be found."@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:Page ;
rdfs:range tln:Image .
tln:hasUrl a owl:DatatypeProperty ;
rdfs:label "has Url"@en ;
rdfs:domain tln:Image ;
rdfs:isDefinedBy ;
rdfs:range xsd:anyURI .
tln:inheritOverwritesWord a owl:ObjectProperty ;
rdfs:subPropertyOf tln:overwritesWord;
rdfs:label "word overwrites word (inherited from tln:wordHasCorrection)"@en ;
rdfs:comment "The author has used this word in order to overwrite that word."@en ;
rdfs:isDefinedBy ;
owl:propertyChainAxiom ( tln:wordHasCorrection tln:overwritesWord ).
tln:lineContinuesOn a owl:ObjectProperty ;
rdfs:label "writing from subject line continues on object line"@en ;
rdfs:comment "the writing that ends on subject line continues on object line"@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:Line ;
rdfs:range tln:Line .
tln:pageIsOnTextField a owl:ObjectProperty ;
rdfs:label "page is on text field"@en ;
rdfs:comment "the writing that is referred to as subject can be found on object"@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:Page ;
rdfs:range tln:TextField .
tln:writingContinuesWithWord a owl:ObjectProperty ;
rdfs:label "writing continues with next word"@en ;
rdfs:isDefinedBy ;
rdfs:domain tln:Word ;
rdfs:range tln:Word .
+
Index: svgscripts/fix_missing_glyphs.py
===================================================================
--- svgscripts/fix_missing_glyphs.py (revision 107)
+++ svgscripts/fix_missing_glyphs.py (revision 108)
@@ -1,210 +1,213 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to fix missing glyphs.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
from colorama import Fore, Style
import getopt
import re
import sys
from os import listdir, sep, path
from os.path import isfile, isdir, dirname
import lxml.etree as ET
if dirname(__file__) not in sys.path:
sys.path.append(dirname(__file__))
from datatypes.page import Page, FILE_TYPE_SVG_WORD_POSITION, FILE_TYPE_XML_MANUSCRIPT
from datatypes.positional_word_part import PositionalWordPart
from datatypes.transkriptionField import TranskriptionField
from datatypes.transkription_position import TranskriptionPosition
from datatypes.word import Word
from util import update_svgposfile_status
sys.path.append('shared_util')
from myxmlwriter import write_pretty
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
UNITTESTING = False
REMOVE_SVG_WORD_POS_PAGE_ENDING = re.compile('_page[0-9]+\w*')
def find_missing_glyph_for_pwp(pwp, svg_path_tree, namespaces, xmin=0.0, ymin=0.0):
"""Finds missing glyph for a PositionalWordPart.
:return: list of PositionalWordPart
"""
THRESHOLD = 15.5
#pwp = PositionalWordPart(node=positional_word_part_node)
word_part_obj = { "x": pwp.left, "y": pwp.top, "text": pwp.text, "matrix": pwp.transform, "class": pwp.style_class }
start_id = int(pwp.id)
threshold = -0.5
positional_word_parts = []
while threshold < THRESHOLD and len(positional_word_parts) < 1:
try:
positional_word_parts = PositionalWordPart.CREATE_POSITIONAL_WORD_PART_LIST(word_part_obj, svg_path_tree, namespaces,\
start_id=start_id, xmin=xmin, ymin=ymin, threshold=threshold, throw_error_if_not_found=True)
except Exception:
threshold += 0.1
return positional_word_parts
def update_word(word, old_transkription_position, old_positional_word_part, positional_word_parts):
"""Updates word according to new positional_word_parts.
:return: new transkription_position
"""
if len(positional_word_parts) > 0:
debug_msg_string = 'update word from ' + __file__
old_transkription_position.positional_word_parts.remove(old_positional_word_part)
positional_word_parts.reverse()
for positional_word_part in positional_word_parts:
old_transkription_position.positional_word_parts.insert(int(old_positional_word_part.id), positional_word_part)
for index, positional_word_part in enumerate(old_transkription_position.positional_word_parts):
positional_word_part.id = index
transkription_positions = TranskriptionPosition.CREATE_TRANSKRIPTION_POSITION_LIST_FROM_PWPS(\
old_transkription_position.positional_word_parts, debug_msg_string=debug_msg_string, transkription_position_id=old_transkription_position.id)
word.transkription_positions.remove(old_transkription_position)
transkription_positions.reverse()
for new_tp in transkription_positions:
word.transkription_positions.insert(int(old_transkription_position.id), new_tp)
text = ''
for index, tp in enumerate(word.transkription_positions):
tp.id = index
tp.writing_process_id = old_transkription_position.writing_process_id
for pwp in tp.positional_word_parts:
text += pwp.text
if word.text != text:
word.text = text
return transkription_positions[0]
def fix_missing_glyphs(svg_word_pos_file, manuscript_file=None):
"""Finds missing glyphs for xml file of type FILE_TYPE_SVG_WORD_POSITION.
"""
if isfile(svg_word_pos_file):
if not UNITTESTING:
print(Fore.LIGHTBLUE_EX + 'Fixing missing glyphs for file {} ... '.format(svg_word_pos_file), end='')
print(Style.RESET_ALL)
page = Page(svg_word_pos_file)
xmin = 0
ymin = 0
if page.svg_image is None or page.svg_image.text_field is None:
transkription_field = TranskriptionField(page.svg_file)
xmin = transkription_field.xmin
ymin = transkription_field.ymin
svg_path_tree = ET.parse(page.svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
number_of_missing_glyphs = len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]'))
words_without_glyphs = [ word for word in page.words\
if len([ tp for tp in word.transkription_positions\
if len([ pwp for pwp in tp.positional_word_parts if pwp.symbol_id is None]) > 0]) > 0 ]
for word in words_without_glyphs:
for transkription_position in word.transkription_positions:
positional_word_parts = transkription_position.positional_word_parts[:]
for positional_word_part in positional_word_parts:
if positional_word_part.symbol_id is None:
pwps = find_missing_glyph_for_pwp(positional_word_part, svg_path_tree, namespaces, xmin=xmin, ymin=ymin)
new_transkription_position = update_word(word, transkription_position, positional_word_part, pwps)
if new_transkription_position is not None:
transkription_position = new_transkription_position
page.update_and_attach_words2tree()
write_pretty(xml_element_tree=page.page_tree, file_name=svg_word_pos_file, script_name=__file__, file_type=FILE_TYPE_SVG_WORD_POSITION)
page = Page(svg_word_pos_file)
new_number_of_missing_glyphs = len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]'))
if not UNITTESTING:
result_color = Fore.LIGHTBLUE_EX if new_number_of_missing_glyphs == 0 else Fore.MAGENTA
print(result_color + ' {0}/{1}'.format(number_of_missing_glyphs-new_number_of_missing_glyphs, number_of_missing_glyphs), end='')
print(Fore.LIGHTBLUE_EX + ' fixed.', end='')
print(Style.RESET_ALL)
if len(page.page_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')) == 0:
update_svgposfile_status(svg_word_pos_file, manuscript_file=manuscript_file, status='OK')
def get_filelist_and_manuscript_file(file_a, file_b=None):
"""Returns a file list and a manuscript file (or None)
"""
file_list = []
manuscript_file = None
source_tree = ET.parse(file_a)
if source_tree.getroot().find('metadata/type').text == FILE_TYPE_SVG_WORD_POSITION\
and len([ word_part for word_part in source_tree.xpath('//' + PositionalWordPart.XML_TAG + '[not(@symbol-id)]')]) > 0: # if symbol_ids are missing ...
file_list.append(file_a)
if file_b is not None:
manuscript_file = file_b
else:
manuscript_file = REMOVE_SVG_WORD_POS_PAGE_ENDING.sub('', file_a)
if not isfile(manuscript_file):
manuscript_file = None
elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT:
manuscript_file = file_a
if file_b is not None:
file_list.append(file_b)
else:
file_list = source_tree.xpath('//page[contains(@status, "{}")]/@output'.format(PositionalWordPart.WARN_NO_USE_NODE_FOUND.lower()))
+ if len(file_list) == 0:
+ file_list = source_tree.xpath('//page[contains(@status, "{}")]/@output'.format(PositionalWordPart.WARN_NO_USE_NODE_FOUND))
return file_list, manuscript_file
def usage():
"""prints information on how to use the script
"""
print(main.__doc__)
def main(argv):
"""This program can be used to fix missing glyphs.
svgscripts/fix_missing_glyphs.py [OPTIONS] -File [-File]
a xml file about a manuscript, containing information about its pages.
a xml file about a page, containing information about svg word positions.
OPTIONS:
-h|--help: show help
:return: exit code (int)
"""
try:
opts, args = getopt.getopt(argv, "h", ["help"])
except getopt.GetoptError:
usage()
return 2
for opt, arg in opts:
if opt in ('-h', '--help'):
usage()
return 0
if len(args) < 1:
usage()
return 2
exit_status = 0
file_a = args[0]
if isfile(file_a):
file_b = None
if len(args) > 1 and isfile(args[1]):
file_b = args[1]
file_list, manuscript_file = get_filelist_and_manuscript_file(file_a, file_b=file_b)
for svg_word_pos_file in file_list:
+ print(f'{svg_word_pos_file}')
fix_missing_glyphs(svg_word_pos_file, manuscript_file=manuscript_file)
else:
raise FileNotFoundError('File {} does not exist!'.format(file_a))
return exit_status
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
Index: svgscripts/datatypes/reconstructed_konvolut.py
===================================================================
--- svgscripts/datatypes/reconstructed_konvolut.py (revision 107)
+++ svgscripts/datatypes/reconstructed_konvolut.py (revision 108)
@@ -1,153 +1,154 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a reconstruction of an original manuscript (e.g. a workbook or notebook).
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import abc
from lxml import etree as ET
from os.path import isfile
import requests
import sys
from .description import Description
from .faksimile_image import FaksimileImage
from .manuscript import ManuscriptUnity
from .page import Page, FILE_TYPE_XML_MANUSCRIPT, FILE_TYPE_SVG_WORD_POSITION
sys.path.append('shared_util')
from myxmlwriter import parse_xml_of_type, write_pretty, xml_has_type
class NonExistentPage(Page):
"""This class represents a page that does not exist as part of the KGW edition.
@label non existent page
"""
NIETZSCHE_SOURCES_URL = 'http://www.nietzschesource.org/DFGAapi/api/fe/facsimile/'
def __init__(self, number=None, faksimile_image=None, status=None):
+ self.page_tree = None
self.number = number
self.status = status
self.faksimile_image = faksimile_image
@classmethod
def create_cls(cls, page_node, faksimile_image=None):
"""
Create an instance of NonExistentPage from a page_node
:return: NonExistentPage
"""
number = page_node.get('title') + '_' + page_node.get('number')\
if bool(page_node.get('title'))\
else page_node.get('number')
return cls(number=number, status=page_node.get('status'), faksimile_image=faksimile_image)
def get_name_and_id(self):
"""Return an identification for object as 2-tuple.
"""
return type(self).__name__, self.number.replace(' ', '_')
@classmethod
def get_semantic_dictionary(cls):
""" Creates and returns a semantic dictionary as specified by SemanticClass.
"""
dictionary = super(NonExistentPage,cls).get_semantic_dictionary()
dictionary[cls.PROPERTIES_KEY].update(cls.create_semantic_property_dictionary('status', str))
return cls.return_dictionary_after_updating_super_classes(dictionary)
class ReconstructedKonvolut(ManuscriptUnity):
"""
This class represents a reconstruction of an original manuscript (e.g. a workbook or notebook).
@label reconstruction of an original manuscript
Args:
title title for identification of the reconstruction
manuscript_type type of manuscript: 'Arbeitsheft' or 'Notizheft'
manuscript_tree lxml.ElementTree
"""
XML_TAG = 'reconstructed-konvolut'
TYPE_DICTIONARY = { 'R_n': 'Notizheft', 'R_w': 'Arbeitsheft' }
UNITTESTING = False
def __init__(self, title='', manuscript_type='', manuscript_tree=None):
super(ReconstructedKonvolut,self).__init__(title=title, manuscript_type=manuscript_type,manuscript_tree=manuscript_tree)
@classmethod
def create_cls(cls, xml_manuscript_file, page_status_list=None, page_xpath=''):
"""Create an instance of ReconstructedKonvolut from a xml file of type FILE_TYPE_XML_MANUSCRIPT.
:return: ReconstructedKonvolut
"""
manuscript = super(ReconstructedKonvolut,cls).create_cls(xml_manuscript_file)
manuscript_tree = manuscript.manuscript_tree
if page_xpath == '':
page_status = ''
if page_status_list is not None\
and type(page_status_list) is list\
and len(page_status_list) > 0:
page_status = '[' + ' and '.join([ f'contains(@status, "{status}")' for status in page_status_list ]) + ']'
page_xpath = f'//pages/page{page_status}/@output'
included_page_list = [ page_source\
for page_source in manuscript_tree.xpath(page_xpath)\
if isfile(page_source) and xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=page_source) ]
for page_node in manuscript_tree.xpath('//pages/page'):
if bool(page_node.get('output'))\
and isfile(page_node.get('output'))\
and xml_has_type(FILE_TYPE_SVG_WORD_POSITION, xml_source_file=page_node.get('output')):
manuscript.pages.append(Page.create_cls(\
page_node.get('output'), create_dummy_page=(page_node.get('output') not in included_page_list)))
else:
faksimile_image = get_or_update_faksimile(xml_manuscript_file, page_node)
manuscript.pages.append(NonExistentPage.create_cls(page_node, faksimile_image))
manuscript.description = Description.create_cls_from_node(manuscript_tree.xpath(Description.XML_TAG)[0])\
if len(manuscript_tree.xpath(Description.XML_TAG)) > 0\
else None
return manuscript
def get_or_update_faksimile(xml_source_file, page_node) ->FaksimileImage:
"""Return the faksimile image of the non existent page.
"""
faksimile_image = None
if len(page_node.xpath(f'./{FaksimileImage.XML_TAG}')) > 0:
faksimile_image = FaksimileImage(node=page_node.xpath(f'./{FaksimileImage.XML_TAG}')[0])
elif bool(page_node.get('alias')):
url = NonExistentPage.NIETZSCHE_SOURCES_URL + page_node.get('alias')
faksimile_dict = None
try:
r = requests.get(url)
faksimile_dict = r.json()
except Exception:
print(f'URL does not work: {url}')
if faksimile_dict is not None and len(faksimile_dict) > 0:
width = faksimile_dict['imageWidth']
height = faksimile_dict['imageHeight']
file_name = page_node.get('alias') + '.jpg'
URL = FaksimileImage.NIETZSCHE_SOURCES_URL + page_node.get('alias')
faksimile_image = FaksimileImage(file_name=file_name, URL=URL, height=height, width=width)
faksimile_image.attach_object_to_tree(page_node)
write_pretty(xml_element_tree=page_node.getroottree(), file_name=xml_source_file, script_name=__file__,\
file_type=FILE_TYPE_XML_MANUSCRIPT, backup=True)
return faksimile_image
Index: svgscripts/datatypes/page.py
===================================================================
--- svgscripts/datatypes/page.py (revision 107)
+++ svgscripts/datatypes/page.py (revision 108)
@@ -1,406 +1,428 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a page.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
from os.path import isfile, basename
from progress.bar import Bar
from svgpathtools import svg2paths2, svg_to_paths
from svgpathtools.parser import parse_path
import re
import sys
import warnings
from .box import Box
from .color import Color
from .image import Image, SVGImage
from .faksimile_image import FaksimileImage
from .faksimile_position import FaksimilePosition
from .lineNumber import LineNumber
from .line import Line
from .mark_foreign_hands import MarkForeignHands
from .matrix import Matrix
from .path import Path
from .positional_word_part import PositionalWordPart
from .super_page import SuperPage
from .style import Style
from .text_connection_mark import TextConnectionMark
from .text_field import TextField
from .transkriptionField import TranskriptionField
from .writing_process import WritingProcess
from .word import Word
from .word_deletion_path import WordDeletionPath
from .word_insertion_mark import WordInsertionMark
sys.path.append('py2ttl')
from class_spec import SemanticClass
sys.path.append('shared_util')
from main_util import extract_paths_on_tf, get_paths_near_position
FILE_TYPE_SVG_WORD_POSITION = SuperPage.FILE_TYPE_SVG_WORD_POSITION
FILE_TYPE_XML_MANUSCRIPT = SuperPage.FILE_TYPE_XML_MANUSCRIPT
STATUS_MERGED_OK = SuperPage.STATUS_MERGED_OK
STATUS_POSTMERGED_OK = SuperPage.STATUS_POSTMERGED_OK
class Page(SemanticClass,SuperPage):
"""
This class represents a page.
Args:
xml_source_file (str): name of the xml file to be instantiated.
faksimile_image: FaksimileImage.
faksimile_svgFile: svg file containing information about word positions.
"""
UNITTESTING = False
def __init__(self, xml_source_file=None, faksimile_image=None, faksimile_svgFile=None, add_paths_near_words=False, warn=False, number=None):
if xml_source_file is not None:
super(Page,self).__init__(xml_source_file)
self.update_property_dictionary('faksimile_image', faksimile_image)
self.update_property_dictionary('faksimile_svgFile', faksimile_svgFile)
self.init_all_properties()
self.add_style(style_node=self.page_tree.getroot().find('.//style'))
self.faksimile_text_field = None
self.svg_text_field = None
self.init_node_objects()
self.warn = warn
self.add_deletion_paths_to_words(add_paths_near_words)
else:
+ self.page_tree = None
self.number = number
def add_deletion_paths_to_words(self, add_paths_near_words=False):
"""Add deletion paths to words.
"""
words = [ word for word in self.words if (len(word.word_parts) == 0 and word.deleted and len(word.deletion_paths) == 0)\
or 'add_paths_near_words' in word.process_flags ]
words += [ word for word in self.words\
if len(word.word_parts) > 0 and True in\
[ (wp.deleted and len(wp.deletion_paths) == 0) for wp in word.word_parts ]]
if len(words) > 0 and ((self.svg_file is not None and isfile(self.svg_file))\
or (self.source is not None and isfile(self.source))):
svg_file = self.svg_file if self.svg_file is not None else self.source
transkription_field = TranskriptionField(svg_file)
tr_xmin = transkription_field.xmin if (self.svg_image is None or self.svg_image.text_field is None) else 0
tr_ymin = transkription_field.ymin if (self.svg_image is None or self.svg_image.text_field is None) else 0
word_deletion_paths = self.word_deletion_paths
index = 0
dp_updated = False
while index < len(words):
word = words[index]
word.add_deletion_paths(word_deletion_paths, tr_xmin=tr_xmin, tr_ymin=tr_ymin)
if len(word.deletion_paths) > 0 or True in [ len(w.deletion_paths) > 0 for w in word.word_parts ]:
deletion_paths = word.deletion_paths
for wp in word.word_parts: deletion_paths += wp.deletion_paths
for deletion_path in deletion_paths:
if deletion_path not in self.word_deletion_paths:
self.word_deletion_paths.append(deletion_path)
elif not dp_updated:
word_deletion_paths = extract_paths_on_tf(self)
dp_updated = True
index -= 1
if add_paths_near_words\
and ('add_paths_near_words' in word.process_flags\
or ((word.deleted and len(word.deletion_paths) == 0)\
or True in [ (w.deleted and len(w.deletion_paths) == 0) for w in word.word_parts ])):
if not dp_updated\
and 'add_paths_near_words' in word.process_flags:
word_deletion_paths = extract_paths_on_tf(self)
dp_updated = True
transform = None
tp = None
target_word = word
paths_near_word = []
if word.deleted and len(word.transkription_positions) > 0:
transform = word.transkription_positions[0].transform
for tp in word.transkription_positions:
word.deletion_paths_near_word += get_paths_near_position(tp, word_deletion_paths)
elif len(word.word_parts) > 0:
for wp in word.word_parts:
if wp.deleted and len(wp.transkription_positions) > 0:
target_word = wp
for tp in wp.transkription_positions:
wp.deletion_paths_near_word = get_paths_near_position(tp, word_deletion_paths)
if self.warn and (word.deleted and len(word.deletion_paths) == 0):
warnings.warn(\
f'WARNING: {self.title} {self.number}: {word.id} on {word.line_number}, {word.text} has no deletion paths! {target_word.deletion_paths_near_word}, {transform}')
index += 1
@classmethod
def create_cls(cls, xml_source_file=None, create_dummy_page=False, page_node=None):
"""Create a Page.
"""
if not create_dummy_page:
return cls(xml_source_file)
else:
m = re.match(r'(.*)(page[0]*)(.*)(\.xml)', xml_source_file)
if m is not None and len(m.groups()) > 3:
number = m.group(3)
else:
number = basename(xml_source_file).replace('.xml','')
return cls(number=number)
@classmethod
def get_pages_from_xml_file(cls, xml_file, status_contains='', status_not_contain='', word_selection_function=None):
"""Returns a list of Page instantiating a xml_file of type FILE_TYPE_SVG_WORD_POSITION
or xml_files contained in xml_file of type FILE_TYPE_XML_MANUSCRIPT.
[optional: instantiation depends on the fulfilment of a status_contains
and/or on the selection of some words by a word_selection_function].
"""
source_tree = ET.parse(xml_file)
if source_tree.getroot().find('metadata/type').text == cls.FILE_TYPE_SVG_WORD_POSITION:
page = cls(xml_file)
if word_selection_function is None or len(word_selection_function(page.words)) > 0:
return [ page ]
else:
return []
elif source_tree.getroot().find('metadata/type').text == FILE_TYPE_XML_MANUSCRIPT:
pages = []
xpath = '//page/@output'
if status_contains != '' and status_not_contain != '':
xpath = '//page[contains(@status, "{0}") and not(contains(@status, "{1}"))]/@output'.format(status_contains, status_not_contain)
elif status_contains != '':
xpath = '//page[contains(@status, "{0}")]/@output'.format(status_contains)
elif status_not_contain != '':
xpath = '//page[not(contains(@status, "{0}"))]/@output'.format(status_not_contain)
for xml_source_file in source_tree.xpath(xpath):
if isfile(xml_source_file):
pages += cls.get_pages_from_xml_file(xml_source_file, word_selection_function=word_selection_function)
return pages
else:
return []
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
dictionary = {}
class_dict = cls.get_class_dictionary()
properties = { 'number': { 'class': str, 'cardinality': 1}}
properties.update(cls.create_semantic_property_dictionary('faksimile_image', FaksimileImage, subPropertyOf=cls.HAS_IMAGE))
properties.update(cls.create_semantic_property_dictionary('faksimile_text_field', TextField,\
name='pageIsOnFaksimileTextField', label='page is on faksimile text field',\
comment='Relates a page to the text field on a svg image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD))
properties.update(cls.create_semantic_property_dictionary('orientation', str))
properties.update(cls.create_semantic_property_dictionary('svg_image', SVGImage, subPropertyOf=cls.HAS_IMAGE))
properties.update(cls.create_semantic_property_dictionary('svg_text_field', TextField,\
name='pageIsOnSVGTextField', label='page is on svg text field',\
comment='Relates a page to the text field on a faksimile image.', subPropertyOf=cls.PAGE_IS_ON_TEXTFIELD))
for key in [ 'lines', 'mark_foreign_hands', 'words', 'word_deletion_paths', 'word_insertion_marks']:
properties.update(cls.create_semantic_property_dictionary(key, list))
dictionary.update({cls.CLASS_KEY: class_dict})
dictionary.update({cls.PROPERTIES_KEY: properties})
return cls.return_dictionary_after_updating_super_classes(dictionary)
def get_word_deletion_path(self, path=None, d_attribute=None) ->WordDeletionPath:
"""Return a word deletion path that belongs to page.
"""
if path is None and d_attribute is None:
raise Exception('ERROR: get_word_deletion_path needs a path or a d_attribute!')
if d_attribute is None:
d_attribute = path.d_attribute
page_paths = [ dpath for dpath in self.word_deletion_paths if dpath.d_attribute == d_attribute ]
if len(page_paths) > 0:
return page_paths[0]
else:
dpath = WordDeletionPath.create_cls(self, path=path, d_attribute=d_attribute)
if dpath is not None:
dpath.id = len(self.word_deletion_paths)
self.word_deletion_paths.append(dpath)
dpath.attach_object_to_tree(self.page_tree)
return dpath
def init_node_objects(self):
"""Initialize all node objects.
"""
self.word_insertion_marks = [ WordInsertionMark(wim_node=wim_node) for wim_node in self.page_tree.getroot().xpath('//' + WordInsertionMark.XML_TAG) ]
self.words = [ Word.create_cls(word_node) for word_node in self.page_tree.getroot().xpath('./word') ]
self.mark_foreign_hands = [ MarkForeignHands.create_cls(node) for node in self.page_tree.getroot().xpath('//' + MarkForeignHands.XML_TAG) ]
self.text_connection_marks = [ TextConnectionMark.create_cls(node) for node in self.page_tree.getroot().xpath('//' + TextConnectionMark.XML_TAG) ]
self.line_numbers = [ LineNumber(xml_text_node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
self.lines = [ Line.create_cls_from_node(node=line_number_node) for line_number_node in self.page_tree.getroot().xpath('//' + LineNumber.XML_TAG) ]
self.writing_processes = [ WritingProcess.create_writing_process_from_xml(node, self.words) for node in self.page_tree.xpath('//' + WritingProcess.XML_TAG) ]
self.word_deletion_paths = [ WordDeletionPath.create_cls(self, node=node) for node in self.page_tree.xpath('./' + WordDeletionPath.XML_TAG) ]
if self.faksimile_image is not None and self.faksimile_image.text_field is not None:
self.faksimile_text_field = self.faksimile_image.text_field
if self.svg_image is not None and self.svg_image.text_field is not None:
self.svg_text_field = self.svg_image.text_field
for simple_word in self.words + self.mark_foreign_hands + self.text_connection_marks:
simple_word.init_word(self)
for wim in self.word_insertion_marks:
if wim.line_number > -1:
wim.line = [ line for line in self.lines if line.id == wim.line_number ][0]
def update_and_attach_words2tree(self, update_function_on_word=None, include_special_words_of_type=[]):
"""Update word ids and attach them to page.page_tree.
"""
if not self.is_locked():
update_function_on_word = [ update_function_on_word ]\
if type(update_function_on_word) != list\
else update_function_on_word
for node in self.page_tree.xpath('.//word|.//' + MarkForeignHands.XML_TAG + '|.//' + TextConnectionMark.XML_TAG):
node.getparent().remove(node)
for index, word in enumerate(self.words):
word.id = index
for func in update_function_on_word:
if callable(func):
func(word)
word.attach_word_to_tree(self.page_tree)
for index, mark_foreign_hands in enumerate(self.mark_foreign_hands):
mark_foreign_hands.id = index
if MarkForeignHands in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(mark_foreign_hands)
mark_foreign_hands.attach_word_to_tree(self.page_tree)
for index, text_connection_mark in enumerate(self.text_connection_marks):
text_connection_mark.id = index
if TextConnectionMark in include_special_words_of_type:
for func in update_function_on_word:
if callable(update_function_on_word):
func(text_connection_mark)
text_connection_mark.attach_word_to_tree(self.page_tree)
else:
print('locked')
def update_data_source(self, faksimile_svgFile=None, xml_correction_file=None):
"""Update the data source of page.
"""
if faksimile_svgFile is not None:
self.faksimile_svgFile = faksimile_svgFile
data_node = self.page_tree.xpath('.//data-source')[0]\
if len(self.page_tree.xpath('.//data-source')) > 0\
else ET.SubElement(self.page_tree.getroot(), 'data-source')
data_node.set('file', self.faksimile_svgFile)
if xml_correction_file is not None:
data_node.set('xml-corrected-words', xml_correction_file)
def update_line_number_area(self, transkription_field, svg_tree=None, set_to_text_field_zero=True):
"""Determines the width of the area where the line numbers are written in the page.source file.
"""
THRESHOLD = 0.4
if svg_tree is None:
svg_tree = ET.parse(self.source)
if len(self.line_numbers) > 1:
line_number = self.line_numbers[9]\
if transkription_field.is_page_verso() and len(self.line_numbers) > 8\
else self.line_numbers[1]
ln_nodes = [ item for item in svg_tree.iterfind('//text', svg_tree.getroot().nsmap)\
if Matrix.IS_NEARX_TRANSKRIPTION_FIELD(item.get('transform'), transkription_field)\
and LineNumber.IS_A_LINE_NUMBER(item)\
and LineNumber(raw_text_node=item).id == line_number.id ]
if len(ln_nodes) > 0:
matrix = Matrix(transform_matrix_string=ln_nodes[0].get('transform'))
if transkription_field.is_page_verso():
transkription_field.add_line_number_area_width(matrix.getX())
elif self.svg_file is not None and isfile(self.svg_file):
svg_path_tree = ET.parse(self.svg_file)
namespaces = { k if k is not None else 'ns': v for k, v in svg_path_tree.getroot().nsmap.items() }
svg_x = matrix.getX()
svg_y = self.line_numbers[1].bottom + transkription_field.ymin\
if set_to_text_field_zero\
else self.line_numbers[1].bottom
use_nodes = svg_path_tree.xpath('//ns:use[@x>="{0}" and @x<="{1}" and @y>="{2}" and @y<="{3}"]'\
.format(svg_x-THRESHOLD, svg_x+THRESHOLD,svg_y-THRESHOLD, svg_y+THRESHOLD), namespaces=namespaces)
if len(use_nodes) > 0:
symbol_id = use_nodes[0].get('{%s}href' % namespaces['xlink']).replace('#', '')
d_strings = use_nodes[0].xpath('//ns:symbol[@id="{0}"]/ns:path/@d'.format(symbol_id), namespaces=namespaces)
if len(d_strings) > 0 and d_strings[0] != '':
path = parse_path(d_strings[0])
xmin, xmax, ymin, ymax = path.bbox()
width = xmax - xmin
transkription_field.add_line_number_area_width(matrix.getX() + width)
def update_page_type(self, transkription_field=None):
"""Adds a source to page and attaches it to page_tree.
"""
if self.number.endswith('r')\
or self.number.endswith('v'):
self.page_type = Page.PAGE_VERSO\
if self.number.endswith('v')\
else Page.PAGE_RECTO
else:
if transkription_field is None:
if self.source is None or not isfile(self.source):
raise FileNotFoundError('Page does not have a source!')
transkription_field = TranskriptionField(self.source, multipage_index=self.multipage_index)
self.page_type = Page.PAGE_VERSO\
if transkription_field.is_page_verso()\
else Page.PAGE_RECTO
self.page_tree.getroot().set('pageType', self.page_type)
def update_styles(self, words=None, manuscript=None, add_to_parents=False, partition_according_to_styles=False, create_css=False):
"""Update styles of words and add them to their transkription_positions.
Args:
add_to_parents: Add styles also to word (and if not None to manuscript).
partition_according_to_styles: Partition word if its transkription_positions have different styles.
"""
style_dictionary = {}
if words is None:
words = self.words
for word in words:
if len(word.word_parts) > 0:
self.update_styles(words=word.word_parts, manuscript=manuscript, create_css=create_css,\
add_to_parents=add_to_parents, partition_according_to_styles=partition_according_to_styles)
for transkription_position in word.transkription_positions:
if len(transkription_position.positional_word_parts) > 0:
style_class = transkription_position.positional_word_parts[0].style_class
writing_process_id = -1
for font_key in [ font_key for font_key in style_class.split(' ') if font_key in self.fontsizekey2stage_mapping.keys() ]:
writing_process_id = self.fontsizekey2stage_mapping.get(font_key)
style_class_key = (Style.remove_irrelevant_style_keys(style_class, self, extended_styles=create_css), writing_process_id)
if create_css:
if style_dictionary.get((style_class_key, word.deleted)) is None:
color = None
if len(word.deletion_paths) > 0:
if word.deletion_paths[0].style_class is not None\
and word.deletion_paths[0].style_class != ''\
and self.style_dict.get(word.deletion_paths[0].style_class) is not None:
color = Color.create_cls_from_style_object(self.style_dict.get(word.deletion_paths[0].style_class))
else:
color = Color()
style_dictionary[(style_class_key, word.deleted)] = Style.create_cls(self, style_class_key[0], manuscript=manuscript,\
create_css=create_css, deletion_color=color, writing_process_id=style_class_key[1] )
transkription_position.style = style_dictionary[(style_class_key, word.deleted)]
#print(style_dictionary[(style_class_key, word.deleted)])
else:
if style_dictionary.get(style_class_key) is None:
style_dictionary[style_class_key] = Style.create_cls(self, style_class_key[0], manuscript=manuscript, create_css=create_css)
style_dictionary[style_class_key].writing_process_id = style_class_key[1]
transkription_position.style = style_dictionary[style_class_key]
if add_to_parents and transkription_position.style not in word.styles:
word.styles.append(transkription_position.style)
if partition_according_to_styles:
word.split_according_to_status('style', splits_are_parts=True)
if manuscript is not None\
and add_to_parents:
manuscript.update_styles(*style_dictionary.values())
+ def __eq__(self, other):
+ """Returns true if self is qualitatively identical to other.
+ """
+ if other is None:
+ return False
+ if self.page_tree is None and other.page_tree is None:
+ return self.number == other.number
+ if self.page_tree is None or other.page_tree is None:
+ return False
+ return self.page_tree.docinfo.URL == other.page_tree.docinfo.URL
+
+ def __hash__(self):
+ """Return a hash value for self.
+ """
+ try:
+ if self.page_tree is None:
+ return hash(self.number)
+ except AttributeError:
+ print(self)
+ return hash(self.number)
+ return hash(self.page_tree.docinfo.URL)
Index: svgscripts/datatypes/text.py
===================================================================
--- svgscripts/datatypes/text.py (revision 107)
+++ svgscripts/datatypes/text.py (revision 108)
@@ -1,185 +1,219 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent a text that may have standoff markup.
"""
# Copyright (C) University of Basel 2020 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
import copy
from lxml import etree as ET
import re
import sys
from .attachable_object import AttachableObject
from .standoff_tag import StandoffTag
sys.path.append('py2ttl')
from class_spec import SemanticClass
+
class Text(AttachableObject,SemanticClass):
"""
This class represents a text that may have standoff markup.
"""
TAG_PATTERN = re.compile(r'([^<]*)(<[^/]+>)')
+ #START_TAG_PATTERN = re.compile(r'.*<[a-z]+>')
+ START_TAG_PATTERN = re.compile(r'[^<]*(?!</)[^<]*<[a-z]+>')
XML_TAG = 'text-with-markup'
XML_SUB_TAG = 'text'
def __init__(self, content=None, standoff_markups=None, id=0, tag=XML_TAG):
self.id = str(id)
self.tag = tag
self.content = content
self.standoff_markups = standoff_markups\
if standoff_markups is not None\
else []
def append(self, content: str) -> int:
"""Extend text with content.
[:return:] startIndex of appended content
"""
startIndex = len(self.content)
self.content += content
return startIndex
def attach_object_to_tree(self, target_tree):
"""Attach object to tree.
"""
if target_tree.__class__.__name__ == '_ElementTree':
target_tree = target_tree.getroot()
obj_node = target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)[0] \
if(len(target_tree.xpath('.//' + self.tag + '[@id="%s"]' % self.id)) > 0) \
else ET.SubElement(target_tree, self.tag)
obj_node.set('id', self.id)
text_node = ET.SubElement(obj_node, self.XML_SUB_TAG)
text_node.text = self.content
for index, markup in enumerate(self.standoff_markups):
markup.id = str(index)
markup.attach_object_to_tree(obj_node)
def extract_part(self, text_part, css_filter=';'):
"""Extract part of text for which text_part matchs content.
:return: datatypes.text.Text
"""
if not css_filter.endswith(';'):
css_filter += ';'
if text_part in self.content:
part_start_index = self.content.find(text_part)
part_end_index = part_start_index + len(text_part)
standoff_markups = [ markup for markup in self.standoff_markups\
if markup.css_string.endswith(css_filter)\
if (markup.startIndex <= part_start_index\
and markup.endIndex > part_start_index)\
or (markup.startIndex >= part_start_index\
and markup.startIndex < part_end_index\
and markup.endIndex <= part_end_index)\
or (markup.startIndex < part_end_index\
and markup.endIndex >= part_end_index)]
new_markups = []
for markup in standoff_markups:
startIndex = markup.startIndex - part_start_index\
if markup.startIndex > part_start_index else 0
endIndex = markup.endIndex - part_start_index\
if markup.endIndex <= part_end_index\
else len(text_part)
new_markups.append(StandoffTag(markup.markup, startIndex, endIndex))
return Text(content=text_part, standoff_markups=new_markups)
else:
msg = f'ERRROR {text_part} is not a part of {self.content}!'
raise Exception(msg)
def join(self, other):
"""Join self and other.
"""
correction = self.append(' ' + other.content) + 1
for standoff_markup in other.standoff_markups:
standoff_markup.startIndex += correction
standoff_markup.endIndex += correction
self.standoff_markups += other.standoff_markups
del other
def markup_contains_css_filter(self, css_filter) ->bool:
"""Returns true if markup contains css_filter.
"""
if not css_filter.endswith(';'):
css_filter += ';'
return len([ markup for markup in self.standoff_markups\
if markup.css_string.endswith(css_filter) ]) > 0
@classmethod
def create_cls_from_node(cls, node):
"""Initialize a cls from node.
[:return:] cls
"""
standoff_markups = [ StandoffTag.create_cls_from_node(item) for item in\
node.xpath('./' + '|./'.join(StandoffTag.MARKUP_STYLES)) ]
text = node.xpath('./' + cls.XML_SUB_TAG + '/text()')[0]\
if len(node.xpath('./' + cls.XML_SUB_TAG + '/text()')) > 0\
else ''
return cls(text, standoff_markups=standoff_markups, id=node.get('id'), tag=node.tag)
@classmethod
def create_cls_from_html(cls, html):
"""Creates a Text from a html string.
:return: a (datatypes.text) Text
"""
- standoff_markups = []
+ html = html.replace('&lt;', '<').replace('&gt;', '>')
+ """
tag_matched = re.match(cls.TAG_PATTERN, html)
while tag_matched is not None:
tag = tag_matched.group(2)
tags = [ t for t in tag.split('<') if t != '']
tags.reverse()
endTag = ''.join([ '</' + t for t in tags])
startIndex = tag_matched.end() - len(tag)
inner_tag_matched = re.match(cls.TAG_PATTERN, html[0:startIndex])
html = html[0:startIndex] + html[tag_matched.end():]
endTag_matched = re.match(rf'(.*)({endTag})', html)
if endTag_matched is not None:
endIndex = endTag_matched.end() - len(endTag)
html = html[0:endIndex] + html[endTag_matched.end():]
for markup in [ StandoffTag.HTML_TAG_DICTIONARY['<'+tag] for tag in tags\
if bool(StandoffTag.HTML_TAG_DICTIONARY.get('<'+tag)) ]:
standoff_markups.append(StandoffTag(markup, startIndex, endIndex))
else:
msg = f'HTML string contains no ending tag for {tag}!'
raise Exception(msg)
tag_matched = re.match(cls.TAG_PATTERN, html)
+ """
+ html, standoff_markups = extract_standoff_data(html)
return cls(html, standoff_markups=standoff_markups)
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
properties = {}
properties.update(cls.create_semantic_property_dictionary('content', str, cardinality=1,\
name='textHasContent', label='content of text', comment='Connects a text with its content.'))
properties.update(cls.create_semantic_property_dictionary('standoff_markups', StandoffTag,\
name='textHasMarkup', label='standoff markup of text', comment='Connects a text with a list of standoff tags.'))
dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties }
return cls.return_dictionary_after_updating_super_classes(dictionary)
+def extract_standoff_data(html) ->(str, list):
+ """Extract standoff data and return the html string without tags and a list of standoff data.
+ """
+ standoff_markups = []
+ tag_matched = re.match(Text.START_TAG_PATTERN, html)
+ while tag_matched:
+ tag = re.sub(r'>.*', '', re.sub(r'^[^<]+<', '', tag_matched.group(0)))
+ startIndex = html.index(f'<{tag}>')
+ html = re.sub(rf'<{tag}>', '', html, count=1)
+ contains_tag_pattern = rf'.*<[a-z]+>.*</{tag}>.*'
+ if re.match(contains_tag_pattern, html):
+ html, new_standoff_data = extract_standoff_data(html)
+ standoff_markups += new_standoff_data
+ end_tag_pattern = rf'.*</{tag}>.*'
+ endTag_matched = re.match(end_tag_pattern, html)
+ if endTag_matched is not None:
+ endIndex = html.index(f'</{tag}>')
+ html = html[0:endIndex] + html[endIndex+len(f'</{tag}>'):]
+ if bool(StandoffTag.HTML_TAG_DICTIONARY.get(f'<{tag}>')):
+ standoff_markups.append(StandoffTag(StandoffTag.HTML_TAG_DICTIONARY[f'<{tag}>'], startIndex, endIndex))
+ else:
+ msg = f'HTML string contains no ending tag for {tag}!'
+ raise Exception(msg)
+ tag_matched = re.match(Text.START_TAG_PATTERN, html)
+ return html, standoff_markups
+
+
+
Index: svgscripts/datatypes/standoff_tag.py
===================================================================
--- svgscripts/datatypes/standoff_tag.py (revision 107)
+++ svgscripts/datatypes/standoff_tag.py (revision 108)
@@ -1,151 +1,152 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent the standoff markup of a text.
"""
# Copyright (C) University of Basel 2020 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. 1}}}
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
import re
import sys
from .attachable_object import AttachableObject
sys.path.append('py2ttl')
from class_spec import SemanticClass
class StandoffTag(AttachableObject,SemanticClass):
"""
This class represents the standoff markup of a text.
"""
- MARKUP_STYLES = [ 'bold', 'italic', 'delete' ]
+ MARKUP_STYLES = [ 'bold', 'italic', 'delete', 'underline' ]
RDFS_SUBCLASSOF_LIST = ['http://www.nie.org/ontology/standoff#StandoffMarkup']
RELEVANT_STYLE_KEY = 'font-family'
RELEVANT_CONTENT_STARTSWITH = 'Frutiger-'
RELEVANT_PATTERN = re.compile('.*(Italic|Bold)$')
RELEVANT_SUB_PATTERN = re.compile('Frutiger-(Light)*')
STOFF_HAS_CSS_URL_STRING = 'http://www.nie.org/ontology/standoff#hasCSS'
STOFF_HAS_START_INDEX = 'http://www.nie.org/ontology/standoff#standoffMarkupHasStartIndex'
STOFF_HAS_END_INDEX = 'http://www.nie.org/ontology/standoff#standoffMarkupHasEndIndex'
- HTML_TAG_DICTIONARY = { '<i>': 'italic', '<b>': 'bold', '<del>': 'delete' }
+ HTML_TAG_DICTIONARY = { '<i>': 'italic', '<b>': 'bold', '<del>': 'delete', '<u>': 'underline' }
CSS_DICTIONARY = { 'bold': 'font-weight:bold;',
'italic': 'font-style: italic;',
+ 'underline': 'text-decoration:underline;',
'delete': 'text-decoration:line-through;' }
def __init__(self, markup: str, startIndex: int, endIndex: int, id=0):
self.id = str(id)
self.css_string = self.CSS_DICTIONARY.get(markup)
self.markup = markup
self.startIndex = startIndex
self.endIndex = endIndex
def attach_object_to_tree(self, target_tree):
"""Attach object to tree.
"""
if target_tree.__class__.__name__ == '_ElementTree':
target_tree = target_tree.getroot()
obj_node = target_tree.xpath('.//' + self.markup + '[@id="%s"]' % self.id)[0] \
if(len(target_tree.xpath('.//' + self.markup + '[@id="%s"]' % self.id)) > 0) \
else ET.SubElement(target_tree, self.markup)
obj_node.set('id', self.id)
obj_node.set('start', str(self.startIndex))
obj_node.set('end', str(self.endIndex))
@classmethod
def create_cls(cls, start_index, end_index, style_string, page=None, style_dict=None):
"""Creates a StandoffTag from a style_string.
:return: a list of (datatypes.standoff_tag) StandoffTag
"""
if page is not None:
style_dict = cls.create_relevant_style_dictionary(page)
relevant_keys = [ key for key in set(style_string.split(' '))\
if key in style_dict.keys() ]
standoff_tags = []
if style_dict is None or len(style_dict) == 0:
return standoff_tags
for relevant_key in relevant_keys:
font_family = style_dict[relevant_key][cls.RELEVANT_STYLE_KEY]
if re.match(cls.RELEVANT_PATTERN, font_family):
markup = re.sub(cls.RELEVANT_SUB_PATTERN, '', font_family).lower()
standoff_tags.append(cls(markup, start_index, end_index))
return standoff_tags
@classmethod
def create_cls_from_node(cls, node):
"""Creates a StandoffTag from a node.
:return: (datatypes.standoff_tag) StandoffTag
"""
return cls(node.tag, int(node.get('start')), int(node.get('end')), id=node.get('id'))
@classmethod
def create_relevant_style_dictionary(cls, page):
"""Return a style dictionary that contains only relevant keys and contents.
"""
return { key: key_dict for key, key_dict in page.style_dict.items()\
if cls.RELEVANT_STYLE_KEY in key_dict.keys()\
and key_dict[cls.RELEVANT_STYLE_KEY].startswith(cls.RELEVANT_CONTENT_STARTSWITH) }
@classmethod
def get_semantic_dictionary(cls):
""" Creates a semantic dictionary as specified by SemanticClass.
"""
properties = {}
#properties.update(cls.create_semantic_property_dictionary('markup', str, cardinality=1,\
# name='standoffTagHasMarkup', label='standoff tag has a specific markup', comment='Connects a standoff tag with its markup, e.g. bold or italic'))
properties.update(cls.create_semantic_property_dictionary('startIndex', int, cardinality=1, subPropertyOf=cls.STOFF_HAS_START_INDEX,\
name='standoffTagHasStartIndex', label='standoff tag has a start index', comment='Connects a standoff tag with its start index.'))
properties.update(cls.create_semantic_property_dictionary('endIndex', int, cardinality=1, subPropertyOf=cls.STOFF_HAS_END_INDEX,\
name='standoffTagHasEndIndex', label='standoff tag has a end index', comment='Connects a standoff tag with its end index.'))
properties.update(cls.create_semantic_property_dictionary('css_string', str,\
subPropertyOf=cls.STOFF_HAS_CSS_URL_STRING,\
name='standoffTagHasCSS', label='standoff tag has css', comment='Connects a standoff tag with CSS style.'))
dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties }
return cls.return_dictionary_after_updating_super_classes(dictionary)
def is_joinable(self, other):
"""Return true if self and other have same markup and self.endIndex == other.startIndex.
"""
return self.markup == other.markup and self.endIndex == other.startIndex
def join(self, other):
"""Join self with other.
"""
self.endIndex = other.endIndex
def join_list(self, others):
"""Join all others that are joinable, return remaining others as a list.
"""
unjoinable_others = []
for other in others:
if self.is_joinable(other):
self.join(other)
else:
unjoinable_others.append(other)
return unjoinable_others
Index: tests_svgscripts/test_description.py
===================================================================
--- tests_svgscripts/test_description.py (revision 107)
+++ tests_svgscripts/test_description.py (revision 108)
@@ -1,40 +1,40 @@
import unittest
from os import sep, path
from os.path import dirname, basename, isfile, isdir
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from datatypes.page import Page
from datatypes.standoff_tag import StandoffTag
from datatypes.text import Text
from datatypes.description import Description
class TestText(unittest.TestCase):
    """Tests for Description creation from manuscript XML nodes."""

    def setUp(self):
        # Resolve the test-data directory relative to this file; fall back
        # one level up when the tests are run from a different location.
        data_dir = dirname(__file__) + sep + 'test_data'
        if not isdir(data_dir):
            data_dir = dirname(dirname(__file__)) + sep + 'test_data'
        # Map attribute name -> fixture file so the list stays in one place.
        fixtures = (
            ('test_file', 'test.xml'),
            ('test_svg_file', 'test421.svg'),
            ('pdf_xml', 'W_I_8_page125.xml'),
            ('xml_file', 'N_VII_1_page005.xml'),
            ('xml_fileB', 'N_VII_1_page006.xml'),
            ('pdf_xml_source', 'W_I_8_neu_125-01.svg'),
            ('test_page', 'N_VII_1_page001.xml'),
            ('test_manuscript', 'N_VII_1.xml'),
        )
        for attribute, filename in fixtures:
            setattr(self, attribute, data_dir + sep + filename)

    def test_semantic(self):
        # Disabled: kept as a placeholder for inspecting the semantic dictionary.
        pass
        #print(Text.get_semantic_dictionary())

    def test_create_cls_from_node(self):
        manuscript_tree = ET.parse(self.test_manuscript)
        node = manuscript_tree.xpath('description/earlierDescription[@id="1"]/manuscriptDescription')[0]
        description = Description.create_cls_from_node(node)
        # The fixture's description is expected to carry standoff markup.
        self.assertTrue(len(description.standoff_markups) > 0)
        #print(description.content, description.standoff_markups)
# Run this test module's cases when executed directly from the command line.
if __name__ == "__main__":
    unittest.main()
Index: tests_svgscripts/test_text.py
===================================================================
--- tests_svgscripts/test_text.py (revision 107)
+++ tests_svgscripts/test_text.py (revision 108)
@@ -1,91 +1,94 @@
import unittest
from os import sep, path
from os.path import dirname, basename, isfile, isdir
import lxml.etree as ET
import sys
sys.path.append('svgscripts')
from datatypes.page import Page
from datatypes.standoff_tag import StandoffTag
from datatypes.text import Text
class TestText(unittest.TestCase):
    """Tests for Text and its standoff markup handling."""

    def setUp(self):
        # Resolve the test-data directory relative to this file; fall back
        # one level up when the tests are run from a different location.
        data_dir = dirname(__file__) + sep + 'test_data'
        if not isdir(data_dir):
            data_dir = dirname(dirname(__file__)) + sep + 'test_data'
        # Map attribute name -> fixture file so the list stays in one place.
        fixtures = (
            ('test_file', 'test.xml'),
            ('test_svg_file', 'test421.svg'),
            ('pdf_xml', 'W_I_8_page125.xml'),
            ('xml_file', 'N_VII_1_page005.xml'),
            ('xml_fileB', 'N_VII_1_page006.xml'),
            ('pdf_xml_source', 'W_I_8_neu_125-01.svg'),
            ('test_page', 'N_VII_1_page001.xml'),
            ('test_manuscript', 'N_VII_1.xml'),
        )
        for attribute, filename in fixtures:
            setattr(self, attribute, data_dir + sep + filename)

    def test_semantic(self):
        # Disabled: kept as a placeholder for inspecting the semantic dictionary.
        pass
        #print(Text.get_semantic_dictionary())

    def test_attach_to_tree(self):
        # Round-trip a Text through an XML tree and check it survives intact.
        tree = ET.ElementTree(ET.Element('page'))
        content = 'asdf'
        bold = StandoffTag('bold', 0, len(content)-1)
        italic = StandoffTag('italic', int(len(content)/2), len(content),id='1')
        original = Text(content, standoff_markups=[ bold, italic ])
        original.attach_object_to_tree(tree)
        restored = Text.create_cls_from_node(tree.xpath('//' + Text.XML_TAG)[0])
        self.assertEqual(restored.content, content)
        self.assertEqual(restored.id, '0')
        self.assertEqual(len(restored.standoff_markups), 2)
        #print(ET.dump(tree.getroot()))

    def test_extract(self):
        content = 'asdfa'
        bold = StandoffTag('bold', 0, len(content)-2)
        italic = StandoffTag('italic', int(len(content)/2), len(content)-1,id='1')
        source = Text(content, standoff_markups=[ bold, italic ])
        # Without a css filter both overlapping markups are kept.
        extracted = source.extract_part('sdf')
        self.assertEqual(len(extracted.standoff_markups), 2)
        # Filtering on 'bold' keeps only the bold markup.
        extracted = source.extract_part('sdf', css_filter='bold')
        self.assertEqual(len(extracted.standoff_markups), 1)
        """
        content = '26: von „Regel]¿'
        textA = Text(content, standoff_markups=[ StandoffTag('bold', 6, 9)])
        print(textA.extract_part('von', css_filter='bold'))
        print(textA.extract_part('„Regel', css_filter='bold'))
        """

    def test_markup_contains_css_filter(self):
        content = 'asdfa'
        bold = StandoffTag('bold', 0, len(content)-2)
        italic = StandoffTag('italic', int(len(content)/2), len(content)-1,id='1')
        text = Text(content, standoff_markups=[ bold, italic ])
        self.assertTrue(text.markup_contains_css_filter('bold'))
        self.assertTrue(text.markup_contains_css_filter('italic'))
        # Removing the bold markup must make the bold filter fail.
        text.standoff_markups.pop(0)
        self.assertFalse(text.markup_contains_css_filter('bold'))

    def test_join(self):
        content = 'asdfa'

        def make_text():
            # Two identical texts so the joined content is predictable.
            bold = StandoffTag('bold', 0, len(content)-2)
            italic = StandoffTag('italic', int(len(content)/2), len(content)-1,id='1')
            return Text(content, standoff_markups=[ bold, italic ])

        first = make_text()
        second = make_text()
        first.join(second)
        self.assertEqual(first.content, content + ' ' + content)

    def test_create_from_html(self):
        # NOTE(review): the first two literals look like their HTML tags were
        # stripped at some point (the assertions expect standoff markups that
        # plain text cannot yield) — verify against version control.
        html = 'asdf test the best'
        text = Text.create_cls_from_html(html)
        self.assertEqual(len(text.standoff_markups), 3)
        self.assertEqual(text.standoff_markups[0].startIndex, text.standoff_markups[1].startIndex)
        self.assertEqual(text.standoff_markups[0].endIndex, text.standoff_markups[1].endIndex)
        html = 'asdf test'
        text = Text.create_cls_from_html(html)
        self.assertEqual(len(text.standoff_markups), 1)
        html = 'Quart-, Oktav- und Folioblätter verschiedenen Formats (z. T. von Albert Brenners und Peter Gasts Hand); Entwürfe und Vorstufen aus dem Bereiche des <i>Menschlichen I</i> (die sogenannten <i>Sorrentiner Papiere</i>)'
        text = Text.create_cls_from_html(html)
        #print(text)
# Run this test module's cases when executed directly from the command line.
if __name__ == "__main__":
    unittest.main()