Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F61437409
standoff_tag.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, May 6, 16:30
Size
6 KB
Mime Type
text/x-python
Expires
Wed, May 8, 16:30 (2 d)
Engine
blob
Format
Raw Data
Handle
17512788
Attached To
rNIETZSCHEPYTHON nietzsche-python
standoff_tag.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This class can be used to represent the standoff markup of a text.
"""
# Copyright (C) University of Basel 2020 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
# Module metadata; the maintainer is the same person as the author.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
from lxml import etree as ET
import re
import sys

from .attachable_object import AttachableObject

# 'py2ttl' must be on sys.path before class_spec is imported below
# (presumably class_spec lives in that directory -- confirm).
sys.path.append('py2ttl')
from class_spec import SemanticClass
class StandoffTag(AttachableObject, SemanticClass):
    """
    This class represents the standoff markup of a text.

    A tag stores a markup name (e.g. 'bold'), the start and end index of the
    span it applies to, an id (kept as a string) and the CSS declaration that
    CSS_DICTIONARY provides for the markup (None if the markup is not a key).
    """
    MARKUP_STYLES = ['bold', 'italic', 'delete', 'underline']
    RDFS_SUBCLASSOF_LIST = ['http://www.nie.org/ontology/standoff#StandoffMarkup']
    # Style entries are only considered if they have this key ...
    RELEVANT_STYLE_KEY = 'font-family'
    # ... and if the value of that key starts with this prefix.
    RELEVANT_CONTENT_STARTSWITH = 'Frutiger-'
    # A font family produces a tag only if its name ends in 'Italic' or 'Bold'.
    RELEVANT_PATTERN = re.compile('.*(Italic|Bold)$')
    # Removed from the font family name to obtain the markup name.
    RELEVANT_SUB_PATTERN = re.compile('Frutiger-(Light)*')
    STOFF_HAS_CSS_URL_STRING = 'http://www.nie.org/ontology/standoff#hasCSS'
    STOFF_HAS_START_INDEX = 'http://www.nie.org/ontology/standoff#standoffMarkupHasStartIndex'
    STOFF_HAS_END_INDEX = 'http://www.nie.org/ontology/standoff#standoffMarkupHasEndIndex'
    # Maps HTML tag strings to markup names.
    HTML_TAG_DICTIONARY = {
        '<i>': 'italic',
        '<b>': 'bold',
        '<del>': 'delete',
        '<underline>': 'underline'
    }
    # Maps markup names to CSS declarations.
    CSS_DICTIONARY = {
        'bold': 'font-weight:bold;',
        'italic': 'font-style: italic;',
        'underline': 'text-decoration:underline;',
        'delete': 'text-decoration:line-through;'
    }

    def __init__(self, markup: str, startIndex: int, endIndex: int, id=0):
        """Create a standoff tag.

        :param markup: name of the markup, e.g. 'bold'
        :param startIndex: index at which the markup starts
        :param endIndex: index at which the markup ends
        :param id: identifier of the tag, stored as str(id)
        """
        self.id = str(id)
        # None if markup is not a key of CSS_DICTIONARY.
        self.css_string = self.CSS_DICTIONARY.get(markup)
        self.markup = markup
        self.startIndex = startIndex
        self.endIndex = endIndex

    def attach_object_to_tree(self, target_tree):
        """Attach object to tree.

        Reuses the first descendant element named like self.markup whose id
        attribute equals self.id, or appends a new child element to the root,
        then writes the 'id', 'start' and 'end' attributes.

        :param target_tree: an element or an element tree (its root is used)
        """
        if target_tree.__class__.__name__ == '_ElementTree':
            target_tree = target_tree.getroot()
        # Evaluate the query only once and reuse the result.
        matching_nodes = target_tree.xpath('.//' + self.markup + '[@id="%s"]' % self.id)
        if len(matching_nodes) > 0:
            obj_node = matching_nodes[0]
        else:
            obj_node = ET.SubElement(target_tree, self.markup)
        obj_node.set('id', self.id)
        obj_node.set('start', str(self.startIndex))
        obj_node.set('end', str(self.endIndex))

    @classmethod
    def create_cls(cls, start_index, end_index, style_string, page=None, style_dict=None):
        """Creates a StandoffTag from a style_string.

        :param start_index: start index passed to each created tag
        :param end_index: end index passed to each created tag
        :param style_string: style keys separated by single spaces
        :param page: if not None, style_dict is replaced by the relevant
            style dictionary of this page
        :param style_dict: dictionary mapping style keys to style dictionaries
        :return: a list of (datatypes.standoff_tag) StandoffTag; empty if no
            style dictionary is available
        """
        if page is not None:
            style_dict = cls.create_relevant_style_dictionary(page)
        # Guard before the dictionary is used: a missing style_dict must
        # result in an empty list rather than an AttributeError.
        if style_dict is None or len(style_dict) == 0:
            return []
        standoff_tags = []
        # dict.fromkeys removes duplicate keys while keeping their order in
        # style_string, so the order of the returned tags is deterministic.
        for style_key in dict.fromkeys(style_string.split(' ')):
            if style_key not in style_dict:
                continue
            font_family = style_dict[style_key][cls.RELEVANT_STYLE_KEY]
            if cls.RELEVANT_PATTERN.match(font_family):
                markup = cls.RELEVANT_SUB_PATTERN.sub('', font_family).lower()
                standoff_tags.append(cls(markup, start_index, end_index))
        return standoff_tags

    @classmethod
    def create_cls_from_node(cls, node):
        """Creates a StandoffTag from a node.

        The node's tag becomes the markup; its 'start' and 'end' attributes
        are converted to int and its 'id' attribute is used as id.

        :return: (datatypes.standoff_tag) StandoffTag
        """
        return cls(node.tag, int(node.get('start')), int(node.get('end')), id=node.get('id'))

    @classmethod
    def create_relevant_style_dictionary(cls, page):
        """Return a style dictionary that contains only relevant keys and contents.

        An entry of page.style_dict is relevant if it has the key
        RELEVANT_STYLE_KEY and the corresponding value starts with
        RELEVANT_CONTENT_STARTSWITH.
        """
        return {
            key: key_dict
            for key, key_dict in page.style_dict.items()
            if cls.RELEVANT_STYLE_KEY in key_dict
            and key_dict[cls.RELEVANT_STYLE_KEY].startswith(cls.RELEVANT_CONTENT_STARTSWITH)
        }

    @classmethod
    def get_semantic_dictionary(cls):
        """ Creates a semantic dictionary as specified by SemanticClass.

        The dictionary describes the properties 'startIndex', 'endIndex' and
        'css_string'.
        """
        properties = {}
        #properties.update(cls.create_semantic_property_dictionary('markup', str, cardinality=1,\
        #        name='standoffTagHasMarkup', label='standoff tag has a specific markup', comment='Connects a standoff tag with its markup, e.g. bold or italic'))
        properties.update(cls.create_semantic_property_dictionary(
            'startIndex', int, cardinality=1, subPropertyOf=cls.STOFF_HAS_START_INDEX,
            name='standoffTagHasStartIndex', label='standoff tag has a start index',
            comment='Connects a standoff tag with its start index.'))
        properties.update(cls.create_semantic_property_dictionary(
            'endIndex', int, cardinality=1, subPropertyOf=cls.STOFF_HAS_END_INDEX,
            name='standoffTagHasEndIndex', label='standoff tag has a end index',
            comment='Connects a standoff tag with its end index.'))
        properties.update(cls.create_semantic_property_dictionary(
            'css_string', str,
            subPropertyOf=cls.STOFF_HAS_CSS_URL_STRING,
            name='standoffTagHasCSS', label='standoff tag has css',
            comment='Connects a standoff tag with CSS style.'))
        dictionary = {cls.CLASS_KEY: cls.get_class_dictionary(), cls.PROPERTIES_KEY: properties}
        return cls.return_dictionary_after_updating_super_classes(dictionary)

    def is_joinable(self, other):
        """Return true if self and other have same markup and self.endIndex == other.startIndex.
        """
        return self.markup == other.markup and self.endIndex == other.startIndex

    def join(self, other):
        """Join self with other by extending self.endIndex to other.endIndex.
        """
        self.endIndex = other.endIndex

    def join_list(self, others):
        """Join all others that are joinable, return remaining others as a list.

        The list is processed once, in order; each element is tested against
        the end index self has at that moment.
        """
        unjoinable_others = []
        for other in others:
            if self.is_joinable(other):
                self.join(other)
            else:
                unjoinable_others.append(other)
        return unjoinable_others
Event Timeline
Log In to Comment