Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F65149412
convert_wordPositions.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Jun 1, 05:47
Size
20 KB
Mime Type
text/x-python
Expires
Mon, Jun 3, 05:47 (2 d)
Engine
blob
Format
Raw Data
Handle
18012615
Attached To
rNIETZSCHEPYTHON nietzsche-python
convert_wordPositions.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert the word positions to HTML for testing purposes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import
cairosvg
import
getopt
from
lxml.html
import
builder
as
E
from
lxml.html
import
open_in_browser
import
lxml
from
os
import
sep
,
listdir
,
mkdir
,
path
,
remove
from
os.path
import
exists
,
isfile
,
isdir
,
dirname
import
re
import
sys
from
svgpathtools
import
svg_to_paths
import
xml.etree.ElementTree
as
ET
if
dirname
(
__file__
)
not
in
sys
.
path
:
sys
.
path
.
append
(
dirname
(
__file__
))
from
datatypes.matrix
import
Matrix
from
datatypes.page
import
Page
from
datatypes.page_creator
import
PageCreator
from
datatypes.transkriptionField
import
TranskriptionField
from
datatypes.writing_process
import
WritingProcess
from
datatypes.word
import
Word
# Module metadata: authorship, contact, development status, licence and version.
__author__ = "Christian Steiner"
# The maintainer is the author.
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"
class Converter:
    """The converter super class.

    Writes the words of a page as plain text and acts as factory for the
    more specific converters that subclass it (see CREATE_CONVERTER).
    """

    def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
        """Store the page that will be converted and the conversion flags.

        :param page: object providing the words to convert (``page.words``).
        :param non_testing: False if the converter runs in test mode.
        :param show_word_insertion_mark: whether word insertion marks should be shown.
        """
        self.page = page
        self.non_testing = non_testing
        self.show_word_insertion_mark = show_word_insertion_mark

    def _get_transkription_positions(self, transkription_positions, stage_version=''):
        """Returns the transkription_positions of the indicated stage_version.

        :param transkription_positions: list of positions with a ``writing_process_id``.
        :param stage_version: '' (no filtering), a single digit ('1'), an open
            range ('1+') or a closed range ('0-1').
        :return: the unfiltered input for an empty stage_version, otherwise a new
            list with the matching positions (empty for an unknown format).
        """
        if stage_version == '':
            return transkription_positions
        if re.match(r'^\d$', stage_version):
            wanted_ids = [int(stage_version)]
        elif re.match(r'^\d\+$', stage_version):
            # 'N+' means: writing process N and every later one.
            wanted_ids = range(int(stage_version.replace('+', '')), len(WritingProcess.VERSION_DESCRIPTION))
        elif re.match(r'^\d\-\d$', stage_version):
            first, last = (int(i) for i in stage_version.split('-'))
            wanted_ids = range(first, last + 1)
        else:
            return []
        return [transkription_position
                for transkription_position in transkription_positions
                if transkription_position.writing_process_id in wanted_ids]

    def _get_words(self, words, highlighted_words=None):
        """Return the words that will be highlighted.

        :return: highlighted_words if specified, otherwise words.
        """
        return highlighted_words if highlighted_words is not None else words

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Prints all words.

        :param output_file: file to write to; sys.stdout is used if None.
        :param stage_version: only write words that have a transkription
            position belonging to this stage version ('' writes all words).
        :param highlighted_words: accepted for interface compatibility with the
            subclasses, not used by the text output.
        """
        if output_file is None:
            # sys.stdout is not owned by this method, so it must not be closed.
            self._write_words(sys.stdout, stage_version)
        else:
            # The context manager closes the file even if writing fails.
            with open(output_file, 'w') as out:
                self._write_words(out, stage_version)

    def _write_words(self, out, stage_version=''):
        """Write the words of the page line by line to the file object out."""
        first_word_of_line = None
        for word in self.page.words:
            if first_word_of_line is None or first_word_of_line.line_number != word.line_number:
                out.write('\n')
                first_word_of_line = word
                # Only even line numbers are printed as line prefix.
                if word.line_number % 2 == 0:
                    out.write(str(word.line_number).zfill(2) + ' ')
                else:
                    out.write(' ')
            if stage_version == '' or len(self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)) > 0:
                if word.text is not None:
                    out.write(word.text + ' ')

    @classmethod
    def CREATE_CONVERTER(cls, page, non_testing=True, converter_type='', show_word_insertion_mark=False):
        """Returns a converter of type converter_type.

        The converter class is looked up by name (converter_type + 'Converter')
        among the direct subclasses of cls.

        [:return:] SVGConverter for 'SVG', HTMLConverter for 'HTML', Converter for None
                   or any other unknown type.
        """
        cls_dict = {subclass.__name__: subclass for subclass in cls.__subclasses__()}
        # `or ''` makes None behave as documented instead of raising a TypeError.
        cls_key = (converter_type or '') + 'Converter'
        converter_class = cls_dict.get(cls_key, Converter)
        return converter_class(page, non_testing, show_word_insertion_mark)
class SVGConverter(Converter):
    """Converter that turns a 'svgWordPositions' xml file into a svg file.

    The resulting svg file combines text as path and text-as-text: for every
    transkription position of a word a semi-transparent 'rect' is appended to
    a copy of the page's svg file.
    """
    BG_COLOR = 'yellow'
    OPACITY = '0.2'

    def __init__(self, page, non_testing=True, show_word_insertion_mark=False, bg_color=BG_COLOR, opacity=OPACITY):
        """Initialize the converter with a fill color and an opacity for the rects."""
        Converter.__init__(self, page, non_testing, show_word_insertion_mark)
        self.opacity = opacity
        self.bg_color = bg_color

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to SVG

        :param output_file: the svg tree is written to this file if it is not None.
        :param stage_version: restricts the transkription positions, see
            _get_transkription_positions.
        :param highlighted_words: words that get the converter's bg_color;
            all the other words are colored 'yellow' in that case.
        :raises Exception: if the page has neither a svg_file nor a svg_image.
        """
        page = self.page
        # The title is computed as in the HTML conversion; it is not written to the svg.
        title = 'Test Page' if page.title is None else page.title
        if page.number is not None:
            title = '{}, S. {}'.format(title, page.number)

        svg_file = page.svg_file
        if svg_file is None:
            if page.svg_image is None:
                msg = f'ERROR: xml_source_file {self.page.docinfo.URL} does neither have a svg_file nor a svg_image!'
                raise Exception(msg)
            svg_file = page.svg_image.file_name

        transkription_field = TranskriptionField(svg_file)
        # Register the namespaces of the source svg so that they are kept on output.
        for prefix, attribute_name in (('', 'xmlns'), ('xlink', 'xmlns:xlink')):
            if bool(transkription_field.get_svg_attributes(attribute_name)):
                ET.register_namespace(prefix, transkription_field.get_svg_attributes(attribute_name))

        svg_tree = ET.parse(svg_file)
        transkription_node = ET.SubElement(svg_tree.getroot(), 'g', attrib={'id': 'Transkription'})

        if self.bg_color == self.BG_COLOR:
            colors = ['yellow', 'orange']
        else:
            colors = [self.bg_color]
        if highlighted_words is None:
            highlighted_words = []
        else:
            colors = ['yellow']

        color_index = 0
        for word in page.words:
            word_id = 'word_' + str(word.id)
            positions = self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)
            for position in positions:
                position_id = word_id + '_' + str(position.id)
                if word in highlighted_words:
                    color = self.bg_color
                else:
                    color = colors[color_index]
                rect_attributes = {
                    'id': position_id,
                    'x': str(position.left + transkription_field.xmin),
                    'y': str(position.top + transkription_field.ymin),
                    'width': str(position.width),
                    'height': str(position.height),
                    'fill': color,
                    'opacity': self.opacity,
                }
                rect_node = ET.SubElement(transkription_node, 'rect', attrib=rect_attributes)
                if position.transform is not None:
                    # Shift a copy of the transformation by the offset of the
                    # transkription field and position the rect relative to it.
                    matrix = position.transform.clone_transformation_matrix()
                    matrix.matrix[Matrix.XINDEX] = round(position.transform.matrix[Matrix.XINDEX] + transkription_field.xmin, 3)
                    matrix.matrix[Matrix.YINDEX] = round(position.transform.matrix[Matrix.YINDEX] + transkription_field.ymin, 3)
                    rect_node.set('transform', matrix.toString())
                    relative_x = round(position.left - position.transform.matrix[Matrix.XINDEX], 3)
                    rect_node.set('x', str(relative_x))
                    relative_y = round((position.height - 1.5) * -1, 3)
                    rect_node.set('y', str(relative_y))
                title_node = ET.SubElement(rect_node, 'title')
                title_node.text = word.text
            # Alternate the color word by word.
            color_index = (color_index + 1) % len(colors)

        if output_file is not None:
            svg_tree.write(output_file)
class HTMLConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a test HTML file.

    The svg file of the page is used as background image of a div; every
    transkription position becomes an absolutely positioned, semi-transparent
    link whose title attribute shows the text of the word.
    """
    CSS = """ .highlight0 { background-color: yellow; opacity: 0.2; }
              .highlight1 { background-color: pink; opacity: 0.2; }
              .foreign { background-color: blue; opacity: 0.4; }
              .word-insertion-mark { background-color: orange; opacity: 0.2; }
              .deleted { background-color: grey; opacity: 0.2; }
    """

    def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
        """Initialize the converter, see Converter.__init__."""
        Converter.__init__(self, page, non_testing, show_word_insertion_mark)

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to HTML

        :param output_file: the HTML is written to this file if it is not None.
        :param stage_version: restricts the transkription positions, see
            _get_transkription_positions.
        :param highlighted_words: accepted for interface compatibility, not used.
        :raises Exception: if the page has neither a svg_image nor a svg_file.
        """
        title = self.page.title if (self.page.title is not None) else 'Test Page'
        title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title
        if stage_version != '':
            title = title + ', Schreibstufe: ' + stage_version
        if self.page.svg_image is not None:
            width = self.page.svg_image.width
            height = self.page.svg_image.height
            svg_file = self.page.svg_image.file_name
        elif self.page.svg_file is not None:
            svg_file = self.page.svg_file
            transkription_field = TranskriptionField(svg_file)
            width = transkription_field.getWidth()
            height = transkription_field.getHeight()
        else:
            # Fail with the same explicit error as SVGConverter instead of an
            # UnboundLocalError on svg_file/width/height further down.
            msg = f'ERROR: xml_source_file {self.page.docinfo.URL} does neither have a svg_file nor a svg_image!'
            raise Exception(msg)
        style_content = ' position: relative; width: {}px; height: {}px; background-image: url("{}"); background-size: {}px {}px '\
                .format(width, height, path.abspath(svg_file), width, height)
        style = E.STYLE('#transkription {' + style_content + '}', HTMLConverter.CSS)
        head = E.HEAD(E.TITLE(title), E.META(charset='UTF-8'), style)
        transkription = E.DIV(id="transkription")
        # counter alternates between 0 and 1 so that neighbouring words differ in color.
        counter = 0
        for word in self.page.words:
            highlight_class = 'highlight' + str(counter)\
                    if not word.deleted else 'deleted'
            word_title = self._create_word_title(word)
            for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
                self._append2transkription(transkription, highlight_class, word_title, transkription_position)
            for part_word in word.word_parts:
                highlight_class = 'highlight' + str(counter)\
                        if not part_word.deleted else 'deleted'
                for part_transkription_position in self._get_transkription_positions(part_word.transkription_positions, stage_version=stage_version):
                    self._append2transkription(transkription, highlight_class, word_title, part_transkription_position)
            counter = (counter + 1) % 2
        word_insertion_mark_class = 'word-insertion-mark'
        for mark_foreign_hands in self.page.mark_foreign_hands:
            highlight_class = 'foreign'
            # Use the line number of the mark itself; the original read it from
            # the loop variable `word` that leaked out of the loop above.
            # NOTE(review): assumes marks expose `line_number` like words and
            # word insertion marks do -- confirm against datatypes.
            mark_title = 'id: {}/line: {}\n{} <i>{}</i>'.format(str(mark_foreign_hands.id), str(mark_foreign_hands.line_number),\
                    mark_foreign_hands.foreign_hands_text, mark_foreign_hands.pen)
            for transkription_position in mark_foreign_hands.transkription_positions:
                self._append2transkription(transkription, highlight_class, mark_title, transkription_position)
        if self.show_word_insertion_mark:
            for word_insertion_mark in self.page.word_insertion_marks:
                wim_title = 'id: {}/line: {}\nword insertion mark'.format(str(word_insertion_mark.id), str(word_insertion_mark.line_number))
                style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
                        word_insertion_mark.top, word_insertion_mark.left, word_insertion_mark.width, word_insertion_mark.height)
                link = E.A(' ', E.CLASS(word_insertion_mark_class), title=wim_title, style=style_content)
                transkription.append(link)
        html = E.HTML(head, E.BODY(transkription))
        # Only open the browser when not running in test mode.
        if self.non_testing:
            open_in_browser(html)
        if output_file is not None:
            with open(output_file, 'wb') as f:
                f.write(lxml.html.tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8'))

    def _create_word_title(self, word):
        """Return the text for the title attribute of word.

        The title shows id and line number of the word followed by its text.
        If an earlier text is found, earlier and current text are listed as
        '0:' and '1:' and the edited text is appended if there is one.
        """
        earlier_text = '' if word.earlier_version is None else word.earlier_version.text
        if earlier_text == '' and len(word.word_parts) > 0:
            parts_with_earlier_version = [part for part in word.word_parts if part.earlier_version is not None]
            # NOTE(review): this takes the text of the word part, not the text of
            # its earlier version -- kept as in the original, please confirm.
            earlier_text = parts_with_earlier_version[0].text if len(parts_with_earlier_version) > 0 else ''
        if earlier_text != '':
            word_title = 'id: {}/line: {}\n0: {}\n1: {}'.format(str(word.id), str(word.line_number), earlier_text, word.text)
            if word.edited_text is not None:
                word_title += f'\n{word.edited_text}'
        else:
            word_title = 'id: {}/line: {}\n{}'.format(str(word.id), str(word.line_number), word.text)
        return word_title

    def _append2transkription(self, transkription, highlight_class, title, transkription_position):
        """Append content to transkription-div.

        Creates a link element with the css class highlight_class and the title
        title that is positioned according to transkription_position.
        """
        style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(\
                transkription_position.top, transkription_position.left, transkription_position.width, transkription_position.height)
        if transkription_position.transform is not None:
            style_content = style_content + ' transform: {}; '.format(transkription_position.transform.toCSSTransformString())
            # The x value of the transform origin is never positive.
            origin_offset = (transkription_position.left - round(transkription_position.transform.getX(), 1)) * -1
            transform_origin_x = origin_offset if origin_offset < 0 else 0
            style_content = style_content + ' transform-origin: {}px {}px; '.format(transform_origin_x, transkription_position.height)
        link = E.A(' ', E.CLASS(highlight_class), title=title, style=style_content)
        transkription.append(link)
def create_pdf_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, pdf_file_name='output.pdf', bg_color=SVGConverter.BG_COLOR):
    """Creates a pdf file highlighting some words.

    A svg file with the same name stem as the pdf file is created as an
    intermediate step and removed after the conversion.

    :param xml_source_file: word position file, used if page is None.
    :param page: the page to convert.
    :param highlighted_words: the words that will be highlighted.
    :param pdf_file_name: name of the pdf file; '.pdf' is appended if missing.
    :param bg_color: the color used for highlighting.
    """
    if not pdf_file_name.endswith('.pdf'):
        pdf_file_name = pdf_file_name + '.pdf'
    # Swap only the trailing extension: str.replace would also rewrite a
    # '.pdf' that occurs inside a directory name of the path.
    tmp_svg_file = pdf_file_name[:-len('.pdf')] + '.svg'
    create_svg_with_highlighted_words(xml_source_file=xml_source_file, page=page, highlighted_words=highlighted_words,\
            svg_file_name=tmp_svg_file, bg_color=bg_color)
    if isfile(tmp_svg_file):
        try:
            cairosvg.svg2pdf(url=tmp_svg_file, write_to=pdf_file_name)
        finally:
            # Do not leave the temporary svg file behind if the conversion fails.
            remove(tmp_svg_file)
def create_svg_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, svg_file_name='output.svg', bg_color=SVGConverter.BG_COLOR):
    """Creates a svg file highlighting some words.

    :param xml_source_file: word position file, used if page is None.
    :param page: the page to convert.
    :param highlighted_words: the words that will be highlighted.
    :param svg_file_name: name of the svg file; '.svg' is appended if missing.
    :param bg_color: the color used for highlighting.
    :raises ValueError: if neither page nor xml_source_file is specified.
    """
    if page is None:
        if xml_source_file is None:
            # Fail early instead of with an AttributeError inside the converter.
            raise ValueError('Either xml_source_file or page must be specified!')
        page = Page(xml_source_file)
    converter = SVGConverter(page, bg_color=bg_color)
    if not svg_file_name.endswith('.svg'):
        svg_file_name = svg_file_name + '.svg'
    converter.convert(output_file=svg_file_name, highlighted_words=highlighted_words)
def usage():
    """Print the help text of the script, i.e. the docstring of main.
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to convert the word positions to HTML, SVG or TEXT for testing purposes.

    svgscripts/convert_wordPositions.py OPTIONS <file>

        OPTIONS:
        -h|--help: show help
        -H|--HTML [default] convert to HTML test file
        -o|--output=outputFile save output to file outputFile
        -P|--PDF convert to PDF test file
        -S|--SVG convert to SVG test file
        -s|--svg=svgFile: svg web file
        -T|--TEXT convert to TEXT output
        -t|--testing execute in test mode, do not write to file or open browser
        -w|--word-insertion-mark show word insertion mark on HTML
        -v|--version=VERSION show words that belong to writing process VERSION: { 0, 1, 2, 0-1, 0+, etc. }

        :return: exit code (int)
    """
    convert_to_type = None
    svg_file = None
    output_file = None
    non_testing = True
    show_word_insertion_mark = False
    page = None
    stage_version = ''

    try:
        opts, args = getopt.getopt(argv, "htHPSTws:o:v:", ["help", "testing", "HTML", "PDF", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version="])
    except getopt.GetoptError:
        usage()
        return 2

    for opt, arg in opts:
        if opt in ('-h', '--help') or not args:
            usage()
            return 0
        elif opt in ('-v', '--version'):
            if re.match(r'^(\d|\d\+|\d\-\d)$', arg):
                stage_version = arg
            else:
                raise ValueError('OPTION -v|--version=VERSION does not work with "{}" as value for VERSION!'.format(arg))
        elif opt in ('-w', '--word-insertion-mark'):
            show_word_insertion_mark = True
        elif opt in ('-P', '--PDF'):
            convert_to_type = 'PDF'
        elif opt in ('-S', '--SVG'):
            convert_to_type = 'SVG'
        elif opt in ('-T', '--TEXT'):
            convert_to_type = 'TEXT'
        elif opt in ('-H', '--HTML'):
            convert_to_type = 'HTML'
        elif opt in ('-t', '--testing'):
            non_testing = False
        elif opt in ('-s', '--svg'):
            svg_file = arg
        elif opt in ('-o', '--output'):
            output_file = arg

    if len(args) < 1:
        usage()
        return 2

    if convert_to_type is None:
        # Without an explicit type option the extension of the output file
        # decides; HTML is the default.
        if output_file is not None and '.' in output_file:
            convert_to_type = output_file.rsplit('.', 1)[-1].upper()
        else:
            convert_to_type = 'HTML'

    for word_position_file in args:
        if not isfile(word_position_file):
            print("'{}' does not exist!".format(word_position_file))
            return 2
        if convert_to_type == 'PDF':
            if output_file is None:
                output_file = 'output.pdf'
            create_pdf_with_highlighted_words(word_position_file, pdf_file_name=output_file)
        else:
            if svg_file is not None:
                if isfile(svg_file):
                    page = PageCreator(word_position_file, svg_file=svg_file)
                else:
                    # Report the file that is actually missing: the svg file,
                    # not the word position file (which exists at this point).
                    print("'{}' does not exist!".format(svg_file))
                    return 2
            else:
                page = Page(word_position_file)
                if page.svg_file is None:
                    print('Please specify a svg file!')
                    usage()
                    return 2
            converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type, show_word_insertion_mark=show_word_insertion_mark)
            converter.convert(output_file=output_file, stage_version=stage_version)
    return 0
# Script entry point: run main with the command line arguments (without the
# script name) and use its return value as exit code of the process.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Event Timeline
Log In to Comment