Page MenuHomec4science

standoff_tag.py
No OneTemporary

File Metadata

Created
Mon, May 6, 16:30

standoff_tag.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent the standoff markup of a text.
"""
# Copyright (C) University of Basel 2020 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata.
__author__ = "Christian Steiner"
# The maintainer is the same person as the author.
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
import re
import sys
from .attachable_object import AttachableObject
sys.path.append('py2ttl')
from class_spec import SemanticClass
class StandoffTag(AttachableObject, SemanticClass):
    """
    This class represents the standoff markup of a text.

    A tag stores the name of a markup (e.g. 'bold' or 'italic'), a start
    index, an end index, an id (kept as a string) and the CSS declaration
    that CSS_DICTIONARY provides for the markup (None for unknown markups).
    """
    # Names of the markups this class knows about.
    MARKUP_STYLES = [ 'bold', 'italic', 'delete', 'underline' ]
    RDFS_SUBCLASSOF_LIST = ['http://www.nie.org/ontology/standoff#StandoffMarkup']
    # Style property that is inspected in order to detect a markup.
    RELEVANT_STYLE_KEY = 'font-family'
    # Only font families starting with this prefix are considered.
    RELEVANT_CONTENT_STARTSWITH = 'Frutiger-'
    # A font family is a markup if its name ends with 'Italic' or 'Bold'.
    RELEVANT_PATTERN = re.compile('.*(Italic|Bold)$')
    # Part of the font family name that is stripped to get the markup name.
    RELEVANT_SUB_PATTERN = re.compile('Frutiger-(Light)*')
    STOFF_HAS_CSS_URL_STRING = 'http://www.nie.org/ontology/standoff#hasCSS'
    STOFF_HAS_START_INDEX = 'http://www.nie.org/ontology/standoff#standoffMarkupHasStartIndex'
    STOFF_HAS_END_INDEX = 'http://www.nie.org/ontology/standoff#standoffMarkupHasEndIndex'
    # Maps html tags to markup names.
    HTML_TAG_DICTIONARY = { '<i>': 'italic', '<b>': 'bold', '<del>': 'delete', '<underline>': 'underline' }
    # Maps markup names to their CSS declaration.
    CSS_DICTIONARY = { 'bold': 'font-weight:bold;',
                       'italic': 'font-style: italic;',
                       'underline': 'text-decoration:underline;',
                       'delete': 'text-decoration:line-through;' }

    def __init__(self, markup: str, startIndex: int, endIndex: int, id=0):
        """Create a standoff tag.

        :param markup: name of the markup, e.g. 'bold'
        :param startIndex: start index of the tag
        :param endIndex: end index of the tag
        :param id: identifier of the tag, stored as a string
        """
        self.id = str(id)
        # None if there is no CSS declaration for this markup.
        self.css_string = self.CSS_DICTIONARY.get(markup)
        self.markup = markup
        self.startIndex = startIndex
        self.endIndex = endIndex

    def attach_object_to_tree(self, target_tree):
        """Attach object to tree.

        Reuses the first descendant node named like the markup that has
        this tag's id; otherwise a new child of the root is created. The
        attributes 'id', 'start' and 'end' are then set on that node.

        :param target_tree: an lxml element or element tree
        """
        if target_tree.__class__.__name__ == '_ElementTree':
            target_tree = target_tree.getroot()
        # Run the query once and reuse its result.
        matching_nodes = target_tree.xpath('.//' + self.markup + '[@id="%s"]' % self.id)
        if len(matching_nodes) > 0:
            obj_node = matching_nodes[0]
        else:
            obj_node = ET.SubElement(target_tree, self.markup)
        obj_node.set('id', self.id)
        obj_node.set('start', str(self.startIndex))
        obj_node.set('end', str(self.endIndex))

    @classmethod
    def create_cls(cls, start_index, end_index, style_string, page=None, style_dict=None):
        """Creates a StandoffTag from a style_string.

        :param start_index: start index for the created tags
        :param end_index: end index for the created tags
        :param style_string: space separated keys of style_dict
        :param page: if given, the style dictionary is built from
            page.style_dict and the argument style_dict is ignored
        :param style_dict: maps style keys to dictionaries of style properties

        :return: a list of (datatypes.standoff_tag) StandoffTag
        """
        if page is not None:
            style_dict = cls.create_relevant_style_dictionary(page)
        standoff_tags = []
        # Check the dictionary before using it: it is None if neither a page
        # nor a style_dict has been passed.
        if not style_dict:
            return standoff_tags
        # dict.fromkeys removes duplicate keys while keeping their order in
        # style_string, hence the order of the created tags is stable.
        relevant_keys = [ key for key in dict.fromkeys(style_string.split(' '))\
                if key in style_dict ]
        for relevant_key in relevant_keys:
            # An entry without a font family cannot match and is skipped.
            font_family = style_dict[relevant_key].get(cls.RELEVANT_STYLE_KEY, '')
            if cls.RELEVANT_PATTERN.match(font_family):
                markup = cls.RELEVANT_SUB_PATTERN.sub('', font_family).lower()
                standoff_tags.append(cls(markup, start_index, end_index))
        return standoff_tags

    @classmethod
    def create_cls_from_node(cls, node):
        """Creates a StandoffTag from a node.

        The tag name of the node is used as markup, its attributes 'start'
        and 'end' are converted to int and its attribute 'id' is used as id.

        :return: (datatypes.standoff_tag) StandoffTag
        """
        return cls(node.tag, int(node.get('start')), int(node.get('end')), id=node.get('id'))

    @classmethod
    def create_relevant_style_dictionary(cls, page):
        """Return a style dictionary that contains only relevant keys and contents.

        An entry of page.style_dict is relevant if it has the key
        RELEVANT_STYLE_KEY and the corresponding value starts with
        RELEVANT_CONTENT_STARTSWITH.
        """
        return { key: key_dict for key, key_dict in page.style_dict.items()\
                if cls.RELEVANT_STYLE_KEY in key_dict.keys()\
                and key_dict[cls.RELEVANT_STYLE_KEY].startswith(cls.RELEVANT_CONTENT_STARTSWITH) }

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.

        The dictionary contains the properties 'startIndex', 'endIndex'
        and 'css_string'.
        """
        properties = {}
        #properties.update(cls.create_semantic_property_dictionary('markup', str, cardinality=1,\
        #        name='standoffTagHasMarkup', label='standoff tag has a specific markup', comment='Connects a standoff tag with its markup, e.g. bold or italic'))
        properties.update(cls.create_semantic_property_dictionary('startIndex', int, cardinality=1, subPropertyOf=cls.STOFF_HAS_START_INDEX,\
                name='standoffTagHasStartIndex', label='standoff tag has a start index', comment='Connects a standoff tag with its start index.'))
        properties.update(cls.create_semantic_property_dictionary('endIndex', int, cardinality=1, subPropertyOf=cls.STOFF_HAS_END_INDEX,\
                name='standoffTagHasEndIndex', label='standoff tag has a end index', comment='Connects a standoff tag with its end index.'))
        properties.update(cls.create_semantic_property_dictionary('css_string', str,\
                subPropertyOf=cls.STOFF_HAS_CSS_URL_STRING,\
                name='standoffTagHasCSS', label='standoff tag has css', comment='Connects a standoff tag with CSS style.'))
        dictionary = { cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties }
        return cls.return_dictionary_after_updating_super_classes(dictionary)

    def is_joinable(self, other):
        """Return true if self and other have same markup and self.endIndex == other.startIndex.
        """
        return self.markup == other.markup and self.endIndex == other.startIndex

    def join(self, other):
        """Join self with other.

        Sets self.endIndex to other.endIndex; other is not modified.
        """
        self.endIndex = other.endIndex

    def join_list(self, others):
        """Join all others that are joinable, return remaining others as a list.

        The others are processed in the given order and every join changes
        self.endIndex, so a tag can become joinable because of a previous
        join.
        """
        unjoinable_others = []
        for other in others:
            if self.is_joinable(other):
                self.join(other)
            else:
                unjoinable_others.append(other)
        return unjoinable_others

Event Timeline