Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F61353357
convert_wordPositions.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, May 6, 03:41
Size
38 KB
Mime Type
text/x-python
Expires
Wed, May 8, 03:41 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
17499844
Attached To
rNIETZSCHEPYTHON nietzsche-python
convert_wordPositions.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to convert the word positions to HTML for testing purposes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import
cairosvg
import
getopt
import
json
from
lxml.html
import
builder
as
E
from
lxml.html
import
open_in_browser
import
lxml
from
pathlib
import
Path
as
PathLibPath
from
os
import
sep
,
listdir
,
mkdir
,
path
,
remove
from
os.path
import
exists
,
isfile
,
isdir
,
dirname
import
re
import
sys
from
svgpathtools
import
svg_to_paths
import
xml.etree.ElementTree
as
ET
# Put the directory that contains this file on the module search path (once),
# presumably so that the project-local imports that follow resolve when the
# file is executed as a script -- TODO confirm.
if dirname(__file__) not in sys.path:
    sys.path.append(dirname(__file__))
from
datatypes.matrix
import
Matrix
from
datatypes.page
import
Page
from
datatypes.page_creator
import
PageCreator
from
datatypes.transkriptionField
import
TranskriptionField
from
datatypes.text_field
import
TextField
from
datatypes.writing_process
import
WritingProcess
from
datatypes.word
import
Word
# 'shared_util' is a relative path, i.e. it is resolved against the current
# working directory; the import of main_util that follows presumably relies on it.
sys.path.append('shared_util')
from
main_util
import
extract_paths_on_tf
,
get_paths_near_position
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

# Base URLs; LOCAL_SERVER is used by JSONConverter to build the 'secondaryURL'
# of the faksimile image. EXIST_DB is not referenced in this file.
EXIST_DB = 'http://existdb-test.dasch.swiss/exist/rest/db/storage/nietzsche/'
LOCAL_SERVER = 'http://localhost:8000/'
class Converter:
    """The converter super class.

    Writes the words of a page as plain text. Subclasses override
    :meth:`convert` in order to produce other output formats.
    """

    def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
        self.page = page
        self.non_testing = non_testing
        self.show_word_insertion_mark = show_word_insertion_mark

    def _get_transkription_positions(self, transkription_positions, stage_version=''):
        """Returns the transkription_positions of the indicated stage_version.

        :param transkription_positions: objects exposing ``writing_process_id``.
        :param stage_version: '' (no filtering), a single digit 'N', an open
            range 'N+' or a closed range 'N-M'.
        :return: the input list itself if stage_version is '', otherwise a new
            list (empty if stage_version matches none of the patterns).
        """
        if stage_version == '':
            return transkription_positions
        if re.match(r'^\d$', stage_version):
            accepted_ids = [int(stage_version)]
        elif re.match(r'^\d\+$', stage_version):
            # 'N+': from N up to the last known writing process.
            first_id = int(stage_version.replace('+', ''))
            accepted_ids = [*range(first_id, len(WritingProcess.VERSION_DESCRIPTION))]
        elif re.match(r'^\d\-\d$', stage_version):
            start, stop = [int(i) for i in re.split(r'-', stage_version)]
            accepted_ids = [*range(start, stop + 1)]
        else:
            return []
        return [tp for tp in transkription_positions
                if tp.writing_process_id in accepted_ids]

    def _get_words(self, words, highlighted_words=None):
        """Return the words that will be hightlighted.

        :return: highlighted_words unless it is None, in which case words.
        """
        if highlighted_words is None:
            return words
        return highlighted_words

    def _write_words(self, out, stage_version):
        """Write all words of self.page to the text stream out, one page line per output line.
        """
        first_word_of_line = None
        for word in self.page.words:
            if first_word_of_line is None or first_word_of_line.line_number != word.line_number:
                out.write('\n')
                first_word_of_line = word
                # Only even line numbers are printed as a line prefix.
                if word.line_number % 2 == 0:
                    out.write(str(word.line_number).zfill(2) + ' ')
                else:
                    out.write(' ')
            if stage_version == ''\
               or len(self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)) > 0:
                if word.text is not None:
                    out.write(word.text + ' ')

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Prints all words.

        :param output_file: file to write to; sys.stdout is used if None.
        :param stage_version: only words with a transkription position of this
            stage version are written ('' writes all words).
        :param highlighted_words: unused by this converter.
        :return: exit code 0
        """
        if output_file is None:
            # sys.stdout belongs to the interpreter and must not be closed here.
            self._write_words(sys.stdout, stage_version)
        else:
            with open(output_file, 'w') as out:
                self._write_words(out, stage_version)
        return 0

    @classmethod
    def CREATE_CONVERTER(cls, page, non_testing=True, converter_type='', show_word_insertion_mark=False, key=''):
        """Returns a converter of type converter_type.

        The converter class is looked up by name (converter_type + 'Converter')
        among the direct subclasses of cls.

        [:return:] SVGConverter for 'SVG', HTMLConverter for 'HTML', Converter for None
        """
        cls_dict = {subclass.__name__: subclass for subclass in cls.__subclasses__()}
        cls_key = (converter_type or '') + 'Converter'
        converter_cls = cls_dict.get(cls_key)
        if converter_cls is None:
            return Converter(page, non_testing, show_word_insertion_mark)
        if cls_key in ('JSONConverter', 'oldJSONConverter'):
            # The json converters take a key instead of show_word_insertion_mark.
            return converter_cls(page, non_testing=non_testing, key=key)
        return converter_cls(page, non_testing, show_word_insertion_mark)
class JSONConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a json file.
    """

    def __init__(self, page, faksimile_page=None, non_testing=True, key=''):
        Converter.__init__(self, page, non_testing, False)
        self.faksimile_page = faksimile_page
        # add_object2dict uses self.key as dictionary key for empty lists.
        self.key = key

    def _add_word_to_list(self, words, word, text, text_field=None, edited_text=None, earlier_version=None, overwrites_word=None, parent_id=-1, faksimile_positions=None):
        """Add word to list.

        Appends one dictionary per transkription position of word to words,
        one dictionary per faksimile position of word to faksimile_positions
        (if given) and then recurses into word.word_parts.

        :param words: list that receives the word dictionaries.
        :param word: the word (or word part) to add.
        :param text: text that is stored as 'text' for the word and its parts.
        :param text_field: if not None, its left/top are added to the positions.
        :param edited_text: overrides word.edited_text if not None.
        :param earlier_version: overrides word.earlier_version if not None.
        :param overwrites_word: overrides word.overwrites_word if not None.
        :param parent_id: id of the parent word, -1 if word is not a word part.
        :param faksimile_positions: list that receives the faksimile dictionaries.
        """
        word_id = word.id if parent_id == -1 else parent_id
        if edited_text is None:
            edited_text = word.edited_text
        if earlier_version is None:
            earlier_version = word.earlier_version
        if overwrites_word is None:
            overwrites_word = word.overwrites_word
        line_number = word.line_number
        for tp in word.transkription_positions:
            tp_id = f'w{word.id}:tp{tp.id}'\
                    if parent_id == -1\
                    else f'w{parent_id}:w{word.id}:tp{tp.id}'
            if text_field is not None:
                word_dict = {'id': word_id, 'text': text, 'left': tp.left + text_field.left, 'top': tp.top + text_field.top,
                             'width': tp.width, 'height': tp.height, 'line': line_number, 'tp_id': tp_id, 'deleted': word.deleted}
                if tp.transform is not None:
                    # Shift the translation part of the matrix by the text field offset.
                    matrix = tp.transform.clone_transformation_matrix()
                    xmin = text_field.left
                    ymin = text_field.top
                    matrix.matrix[Matrix.XINDEX] = round(tp.transform.matrix[Matrix.XINDEX] + xmin, 3)
                    matrix.matrix[Matrix.YINDEX] = round(tp.transform.matrix[Matrix.YINDEX] + ymin, 3)
                    word_dict.update({'transform': matrix.toString()})
                    if tp.left > 0:
                        word_dict.update({'left': round(tp.left - tp.transform.matrix[Matrix.XINDEX], 3)})
                    else:
                        word_dict.update({'left': 0})
                    word_dict.update({'top': round((tp.height - 1.5) * -1, 3)})
            else:
                word_dict = {'id': word_id, 'text': text, 'left': tp.left, 'top': tp.top, 'width': tp.width,
                             'height': tp.height, 'line': line_number, 'tp_id': tp_id, 'deleted': word.deleted}
                if tp.transform is not None:
                    word_dict.update({'transform': tp.transform.toString()})
            if edited_text is not None:
                word_dict.update({'edited_text': edited_text})
            if earlier_version is not None:
                word_dict.update({'earlier_version': earlier_version.text})
            if overwrites_word is not None:
                word_dict.update({'overwrites_word': overwrites_word.text})
            if parent_id > -1:
                word_dict.update({'part_text': word.text})
            # NOTE(review): the nesting of the following branch was reconstructed
            # from a source without indentation -- confirm against the repository.
            if len(word.deletion_paths) > 0:
                for dp in word.deletion_paths:
                    # Every deletion path gets its own dictionary.
                    if bool(word_dict.get('deletion_path')):
                        word_dict = word_dict.copy()
                    word_dict.update({'deletion_path': dp.d_attribute})
                    words.append(word_dict)
                if len(word.deletion_paths_near_word) > 0:
                    word_dict.update({'paths_near_word': word.deletion_paths_near_word})
                    words.append(word_dict)
            else:
                words.append(word_dict)
        if faksimile_positions is not None:
            for fp in word.faksimile_positions:
                self._add_faksimile_to_list(word_id, line_number, fp, word.deleted, faksimile_positions, text, edited_text=edited_text,
                                            earlier_version=earlier_version, overwrites_word=overwrites_word, parent_id=parent_id, word_text=word.text)
        for wp in word.word_parts:
            self._add_word_to_list(words, wp, text, text_field=text_field, edited_text=edited_text,
                                   earlier_version=earlier_version, overwrites_word=overwrites_word, parent_id=word.id, faksimile_positions=faksimile_positions)

    def _add_faksimile_to_list(self, id, line_number, fp, deleted, faksimile_positions, text, edited_text=None, earlier_version=None, overwrites_word=None, parent_id=-1, word_text='') -> None:
        """Create a json dictionary for the faksimile position fp and append it to faksimile_positions.

        :param id: id stored as 'id' (the id of the word or of its parent).
        :param line_number: stored as 'line'.
        :param fp: the faksimile position (left, top, width, height, id, transform).
        :param deleted: stored as 'deleted'.
        :param faksimile_positions: list that receives the new dictionary.
        :param text: stored as 'text'.
        :param word_text: stored as 'part_text' if parent_id > -1.
        """
        faksimile_dict = {'id': id, 'text': text, 'left': fp.left, 'top': fp.top,
                          'width': fp.width, 'height': fp.height, 'line': line_number, 'fp_id': fp.id, 'deleted': deleted}
        if fp.transform is not None:
            faksimile_dict.update({'transform': fp.transform.toString()})
        if edited_text is not None:
            faksimile_dict.update({'edited_text': edited_text})
        if earlier_version is not None:
            faksimile_dict.update({'earlier_version': earlier_version.text})
        if overwrites_word is not None:
            faksimile_dict.update({'overwrites_word': overwrites_word.text})
        if parent_id > -1:
            faksimile_dict.update({'part_text': word_text})
        faksimile_positions.append(faksimile_dict)

    def create_json_dict(self) -> dict:
        """Create and return a json dictionary.

        :return: dictionary with the keys 'title', 'number', 'words', 'svg',
            'lines', 'faksimile', 'faksimile_positions' and 'faksimile_lines'.
        """
        words = []
        faksimile_positions = []
        text_field = None
        if self.page.svg_image is not None:
            if self.page.svg_image.text_field is None:
                # The svg image has no text field yet: derive it from the svg
                # file and use it as offset for the word positions.
                text_field = self.page.svg_image.text_field = TranskriptionField(self.page.svg_image.file_name).convert_to_text_field()
        for word in self.page.words:
            self._add_word_to_list(words, word, word.text, text_field=text_field, faksimile_positions=faksimile_positions)
        lines = []
        faksimile_lines = []
        offset = 0 if text_field is None else text_field.ymin
        svg_image = self.add_object2dict(self.page.svg_image)
        if self.faksimile_page is not None:
            if self.page.faksimile_image is None:
                if self.faksimile_page.faksimile_image.text_field is None\
                   and self.faksimile_page.text_field is not None:
                    self.faksimile_page.faksimile_image.text_field = self.faksimile_page.text_field
                self.page.faksimile_image = self.faksimile_page.faksimile_image
            # Add the faksimile positions that are not attached to any word.
            known_fp_ids = {f_dict.get('fp_id') for f_dict in faksimile_positions}
            for fp in self.faksimile_page.word_positions:
                if fp.id not in known_fp_ids:
                    self._add_faksimile_to_list(fp.id, -1, fp, False, faksimile_positions, fp.text)
                    known_fp_ids.add(fp.id)
        faksimile_image = self.add_object2dict(self.page.faksimile_image)
        if svg_image is not None:
            svg_image.update({'URL': self.page.svg_image.primaryURL})
            svg_image.update({'x': self.page.svg_image.text_field.left})
            svg_image.update({'y': self.page.svg_image.text_field.top})
        if faksimile_image is not None:
            faksimile_image.update({'secondaryURL': LOCAL_SERVER + "faksimiles/" + self.page.faksimile_image.file_name})
            faksimile_image.update({'x': 0})
            faksimile_image.update({'y': 0})
        for line in self.page.lines:
            lines.append({'id': line.id, 'number': line.id, 'top': line.top + offset, 'bottom': line.bottom})
            faksimile_lines.append({'id': line.id, 'number': line.id, 'top': line.faksimile_inner_top, 'bottom': line.faksimile_inner_bottom})
        return {'title': self.page.title, 'number': self.page.number, 'words': words, 'svg': svg_image, 'lines': lines,
                'faksimile': faksimile_image, 'faksimile_positions': faksimile_positions, 'faksimile_lines': faksimile_lines}

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to JSON.

        :param output_file: target file, 'output.json' if None.
        :param stage_version: unused by this converter.
        :param highlighted_words: unused by this converter.
        :return: exit code 0
        :raises Exception: if the json dictionary cannot be created or dumped.
        """
        if output_file is None:
            output_file = 'output.json'
        with open(output_file, "w+") as json_file:
            try:
                json.dump(self.create_json_dict(), json_file)
            except Exception as err:
                # Keep the original error as cause of the generic exception.
                raise Exception('Error in json.dump') from err
        return 0

    def add_object2dict(self, object_instance):
        """Add an object to json_dict and generate json data and interfaces.

        Builtin values are returned unchanged, lists are converted item by
        item; all other objects are converted by means of the 'properties' of
        their class' semantic dictionary.

        [:return:] json dict or object_instance
        """
        json_dict = {}
        object_type = type(object_instance)
        if object_type.__module__ == 'builtins':
            if object_type != list:
                return object_instance
            items = [self.add_object2dict(item) for item in object_instance]
            if len(items) > 0:
                return items
            return {self.key: []}
        semantic_dictionary = object_type.get_semantic_dictionary()
        for key, property_content in semantic_dictionary['properties'].items():
            content_type = property_content.get('class')
            content = object_instance.__dict__.get(key)
            if content_type == list\
               and content is not None\
               and len(content) > 0\
               and type(content[0]).__module__ != 'builtins':
                # A list of project objects: convert each item.
                json_dict.update({key: [self.add_object2dict(content_item) for content_item in content]})
            elif content_type.__module__ == 'builtins':
                if content is not None:
                    json_dict.update({key: content})
            elif content is not None:
                if type(content) == list:
                    json_dict.update({key: [self.add_object2dict(content_item) for content_item in content]})
                else:
                    json_dict.update({key: self.add_object2dict(content)})
        return json_dict
class oldJSONConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a json file.

    In addition to the json file it writes a typescript interface file for
    each converted class to the directory 'ts_interfaces' and an import file
    'ts_imports.ts'.
    """
    # Maps python types to typescript type names.
    PY2TS_DICT = {float: 'number', int: 'number', bool: 'boolean', str: 'string'}

    def __init__(self, page, non_testing=True, key=''):
        Converter.__init__(self, page, non_testing, False)
        self.key = key
        self.interface_output_dir = PathLibPath('ts_interfaces')
        if not self.interface_output_dir.is_dir():
            self.interface_output_dir.mkdir()
        else:
            # Remove the interface files of previous runs.
            for ts_file in list(self.interface_output_dir.glob('*.ts')):
                remove(ts_file)

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to JSON.

        :param output_file: target file, 'output.json' if None.
        :param stage_version: unused by this converter.
        :param highlighted_words: unused by this converter.
        :return: exit code: 0 on success, 2 if the page has no object at self.key.
        :raises Exception: if the json data cannot be dumped.
        """
        if output_file is None:
            output_file = 'output.json'
        class_dict = {}
        if self.key != '':
            object_instance = self.page.__dict__.get(self.key)
            if object_instance is None:
                print(f'Page initialized from {self.page.page_tree.docinfo.URL} does not have an object at "{self.key}"!')
                return 2
            json_dict = self.add_object2dict(object_instance, class_dict)
            if type(json_dict) == list:
                json_dict = {self.key: json_dict}
        else:
            json_dict = self.add_object2dict(self.page, class_dict)
        with open(output_file, "w+") as json_file:
            try:
                json.dump(json_dict, json_file)
            except Exception as err:
                # Keep the original error as cause of the generic exception.
                raise Exception('Error in json.dump') from err
        self.create_imports(class_dict)
        return 0

    def add_object2dict(self, object_instance, class_dict):
        """Add an object to json_dict and generate json data and interfaces.

        :param object_instance: the value or object to convert.
        :param class_dict: maps each converted class to the path of its
            interface file; it is extended by this method.

        [:return:] json dict or object_instance
        """
        json_dict = {}
        interface_list = []
        object_type = type(object_instance)
        if object_type.__module__ == 'builtins':
            if object_type != list:
                return object_instance
            items = [self.add_object2dict(item, class_dict) for item in object_instance]
            if len(items) > 0:
                return {self.key: items}
            return {self.key: 'null'}
        semantic_dictionary = object_type.get_semantic_dictionary()
        for key, property_content in semantic_dictionary['properties'].items():
            content_type = property_content.get('class')
            content = object_instance.__dict__.get(key)
            if content_type == list\
               and content is not None\
               and len(content) > 0\
               and type(content[0]).__module__ != 'builtins':
                # A list of project objects: convert each item.
                content_list = [self.add_object2dict(content_item, class_dict) for content_item in content]
                json_dict.update({key: content_list})
                interface_list.append(f'{key}: {type(content[0]).__name__}[];')
            elif content_type.__module__ == 'builtins':
                if content_type != list:
                    # Types without a typescript counterpart are declared as string.
                    ts_type = self.PY2TS_DICT.get(content_type, 'string')
                    interface_list.append(f'{key}: {ts_type};')
                json_dict.update({key: content})
            else:
                if content is not None and type(content) == list:
                    interface_list.append(f'{key}: {content_type.__name__}[];')
                    content_list = [self.add_object2dict(content_item, class_dict) for content_item in content]
                    json_dict.update({key: content_list})
                else:
                    interface_list.append(f'{key}: {content_type.__name__};')
                    if content is not None:
                        json_dict.update({key: self.add_object2dict(content, class_dict)})
        if object_type not in class_dict:
            class_dict.update({object_type: self.create_interface(object_type.__name__, interface_list)})
        return json_dict

    def create_imports(self, class_dict):
        """Create the file 'ts_imports.ts' that imports all interfaces of class_dict.

        :param class_dict: maps classes to the paths of their interface files.

        [:return:] file_name of the import file
        """
        ts_file = PathLibPath('ts_imports.ts')
        with open(ts_file, "w+") as file:
            file.write(f'//import all interfaces from {self.interface_output_dir} ' + '\n')
            for interface_name, path_name in class_dict.items():
                file.write('import {' + interface_name.__name__ + '} from \'./' + str(self.interface_output_dir.joinpath(path_name.stem)) + '\';\n')
        return ts_file

    def create_interface(self, class_name, interface_list) -> PathLibPath:
        """Create an ts interface from a list of key and content_types.

        :param class_name: name of the interface; the file name is its lower case form.
        :param interface_list: strings of the form '<key>: <type>;'.

        [:return:] file_name of interface
        """
        ts_file = self.interface_output_dir.joinpath(PathLibPath(f'{class_name.lower()}.ts'))
        ts_builtin_types = set(self.PY2TS_DICT.values())
        # Every non builtin type of the interface has to be imported.
        import_names = {interface_string.split(': ')[1].replace(';', '').replace('[]', '')
                        for interface_string in interface_list}
        with open(ts_file, "w") as file:
            # sorted: keep the generated file stable between runs.
            for import_class_name in sorted(import_names - ts_builtin_types):
                file.write('import {' + import_class_name + '} from \'./' + import_class_name.lower() + '\';\n')
            file.write(f'export interface {class_name} ' + '{\n')
            for interface_string in interface_list:
                file.write('\t' + interface_string + '\n')
            file.write('}')
        return ts_file
class SVGConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a svg file that combines text as path and text-as-text.
    """
    BG_COLOR = 'yellow'
    OPACITY = '0.2'

    def __init__(self, page, non_testing=True, show_word_insertion_mark=False, bg_color=BG_COLOR, opacity=OPACITY):
        Converter.__init__(self, page, non_testing, show_word_insertion_mark)
        self.bg_color = bg_color
        self.opacity = opacity

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to SVG

        Adds a semi-transparent rect for each transkription position of each
        word to a group 'Transkription' of the svg tree and writes the tree
        to output_file (if given).

        :return: exit code 0
        :raises Exception: if the page has neither a svg_file nor a svg_image.
        """
        title = 'Test Page' if self.page.title is None else self.page.title
        if self.page.number is not None:
            title = '{}, S. {}'.format(title, self.page.number)
        svg_file = self.page.svg_file
        if svg_file is None:
            if self.page.svg_image is None:
                msg = f'ERROR: xml_source_file {self.page.docinfo.URL} does neither have a svg_file nor a svg_image!'
                raise Exception(msg)
            svg_file = self.page.svg_image.file_name
        transkription_field = TranskriptionField(svg_file)
        # Register the namespaces of the source svg so that they are kept on output.
        for prefix, attribute in (('', 'xmlns'), ('xlink', 'xmlns:xlink')):
            if bool(transkription_field.get_svg_attributes(attribute)):
                ET.register_namespace(prefix, transkription_field.get_svg_attributes(attribute))
        svg_tree = ET.parse(svg_file)
        transkription_node = ET.SubElement(svg_tree.getroot(), 'g', attrib={'id': 'Transkription'})
        if self.bg_color == self.BG_COLOR:
            colors = ['yellow', 'orange']
        else:
            colors = [self.bg_color]
        if highlighted_words is None:
            highlighted_words = []
        else:
            colors = ['yellow']
        color_index = 0
        for word in self.page.words:
            word_id = 'word_' + str(word.id)
            positions = self._get_transkription_positions(word.transkription_positions, stage_version=stage_version)
            for position in positions:
                fill = self.bg_color if word in highlighted_words else colors[color_index]
                rect_attributes = {
                    'id': word_id + '_' + str(position.id),
                    'x': str(position.left + transkription_field.xmin),
                    'y': str(position.top + transkription_field.ymin),
                    'width': str(position.width),
                    'height': str(position.height),
                    'fill': fill,
                    'opacity': self.opacity,
                }
                rect_node = ET.SubElement(transkription_node, 'rect', attrib=rect_attributes)
                if position.transform is not None:
                    # Shift the translation part of the matrix by the offset
                    # of the transkription field.
                    matrix = position.transform.clone_transformation_matrix()
                    source_matrix = position.transform.matrix
                    matrix.matrix[Matrix.XINDEX] = round(source_matrix[Matrix.XINDEX] + transkription_field.xmin, 3)
                    matrix.matrix[Matrix.YINDEX] = round(source_matrix[Matrix.YINDEX] + transkription_field.ymin, 3)
                    rect_node.set('transform', matrix.toString())
                    rect_node.set('x', str(round(position.left - position.transform.matrix[Matrix.XINDEX], 3)))
                    rect_node.set('y', str(round((position.height - 1.5) * -1, 3)))
                title_node = ET.SubElement(rect_node, 'title')
                title_node.text = word.text
            # Alternate the colors word by word.
            color_index = (color_index + 1) % len(colors)
        if output_file is not None:
            svg_tree.write(output_file)
        return 0
class HTMLConverter(Converter):
    """This class can be used to convert a 'svgWordPositions' xml file to a test HTML file.
    """
    CSS = """ .highlight0 { background-color: yellow; opacity: 0.2; }
.highlight1 { background-color: pink; opacity: 0.2; }
.highlight2 { background-color: red; opacity: 0.2; }
.foreign { background-color: blue; opacity: 0.4; }
.overwritten { background-color: green; opacity: 0.4; }
.word-insertion-mark { background-color: orange; opacity: 0.2; }
.deleted { background-color: grey; opacity: 0.2; }
"""

    def __init__(self, page, non_testing=True, show_word_insertion_mark=False):
        Converter.__init__(self, page, non_testing, show_word_insertion_mark)
        self.text_field = TextField()

    def _create_word_title(self, word):
        """Return the title (tooltip text) for word.
        """
        earlier_text = '' if word.earlier_version is None else word.earlier_version.text
        if earlier_text == '' and len(word.word_parts) > 0:
            # Fall back to the first word part that has an earlier version.
            parts_with_earlier_version = [part for part in word.word_parts if part.earlier_version is not None]
            earlier_text = parts_with_earlier_version[0].text if len(parts_with_earlier_version) > 0 else ''
        if earlier_text != '':
            word_title = 'id: {}/line: {}\n0: {}\n1: {}'.format(str(word.id), str(word.line_number), earlier_text, word.text)
        else:
            word_title = 'id: {}/line: {}\n{}'.format(str(word.id), str(word.line_number), word.text)
        if word.edited_text is not None:
            word_title += f'\n>{word.edited_text}'
        return word_title

    def convert(self, output_file=None, stage_version='', highlighted_words=None):
        """Converts Page to HTML

        Creates a html page that shows the svg as background image of a div
        and places a link with a title on each transkription position. The page
        is opened in the browser if self.non_testing is set and written to
        output_file if given.

        :param output_file: target file, nothing is written if None.
        :param stage_version: only show positions of this stage version.
        :param highlighted_words: words that get the class 'highlight2'.
        :return: exit code 0
        :raises Exception: if the page has neither a svg_image nor a svg_file.
        """
        title = self.page.title if (self.page.title is not None) else 'Test Page'
        title = '{}, S. {}'.format(title, self.page.number) if (self.page.number is not None) else title
        if stage_version != '':
            title = title + ', Schreibstufe: ' + stage_version
        if self.page.svg_image is not None:
            width = self.page.svg_image.width
            height = self.page.svg_image.height
            svg_file = self.page.svg_image.file_name
            if self.page.svg_image.text_field is not None:
                self.text_field = self.page.svg_image.text_field
                print('Textfield found ->adjusting data')
        elif self.page.svg_file is not None:
            svg_file = self.page.svg_file
            transkription_field = TranskriptionField(svg_file)
            width = transkription_field.getWidth()
            height = transkription_field.getHeight()
        else:
            # Without a svg source, width, height and svg_file would be undefined below.
            raise Exception('ERROR: page does neither have a svg_file nor a svg_image!')
        style_content = ' position: relative; width: {}px; height: {}px; background-image: url("{}"); background-size: {}px {}px '\
            .format(width, height, path.abspath(svg_file), width, height)
        style = E.STYLE('#transkription {' + style_content + '}', HTMLConverter.CSS)
        head = E.HEAD(E.TITLE(title), E.META(charset='UTF-8'), style)
        transkription = E.DIV(id="transkription")
        counter = 0
        for word in self.page.words:
            highlight_class = 'highlight' + str(counter)\
                if not word.deleted else 'deleted'
            if highlighted_words is not None\
               and word in highlighted_words:
                highlight_class = 'highlight2'
            word_title = self._create_word_title(word)
            for transkription_position in self._get_transkription_positions(word.transkription_positions, stage_version=stage_version):
                self._append2transkription(transkription, highlight_class, word_title, transkription_position)
            if word.overwrites_word is not None:
                overwritten_title = f'{word.text} overwrites {word.overwrites_word.text}'
                for overwritten_transkription_position in word.overwrites_word.transkription_positions:
                    self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position)
            for part_word in word.word_parts:
                highlight_class = 'highlight' + str(counter)\
                    if not part_word.deleted else 'deleted'
                for part_transkription_position in self._get_transkription_positions(part_word.transkription_positions, stage_version=stage_version):
                    self._append2transkription(transkription, highlight_class, word_title, part_transkription_position)
                if part_word.overwrites_word is not None:
                    overwritten_title = f'{word.text} overwrites {part_word.overwrites_word.text}'
                    for overwritten_transkription_position in part_word.overwrites_word.transkription_positions:
                        self._append2transkription(transkription, 'overwritten', overwritten_title, overwritten_transkription_position)
            # Alternate between the classes highlight0 and highlight1.
            counter = (counter + 1) % 2
        word_insertion_mark_class = 'word-insertion-mark'
        for mark_foreign_hands in self.page.mark_foreign_hands:
            highlight_class = 'foreign'
            title = 'id: {}/line: {}\n{} <i>{}</i>'.format(str(mark_foreign_hands.id), str(mark_foreign_hands.line_number),
                                                            mark_foreign_hands.foreign_hands_text, mark_foreign_hands.pen)
            for transkription_position in mark_foreign_hands.transkription_positions:
                self._append2transkription(transkription, highlight_class, title, transkription_position)
        if self.show_word_insertion_mark:
            for word_insertion_mark in self.page.word_insertion_marks:
                wim_title = 'id: {}/line: {}\nword insertion mark'.format(str(word_insertion_mark.id), str(word_insertion_mark.line_number))
                style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(
                    word_insertion_mark.top, word_insertion_mark.left, word_insertion_mark.width, word_insertion_mark.height)
                link = E.A(' ', E.CLASS(word_insertion_mark_class), title=wim_title, style=style_content)
                transkription.append(link)
        html = E.HTML(head, E.BODY(transkription))
        if bool(self.non_testing):
            open_in_browser(html)
        if output_file is not None:
            with open(output_file, 'wb') as f:
                f.write(lxml.html.tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8'))
        return 0

    def _append2transkription(self, transkription, highlight_class, title, transkription_position):
        """Append content to transkription-div.

        Appends a link with class highlight_class and the given title that is
        absolutely positioned at transkription_position, relative to
        self.text_field.
        """
        style_content = 'position:absolute; top:{0}px; left:{1}px; width:{2}px; height:{3}px;'.format(
            transkription_position.top - self.text_field.top, transkription_position.left - self.text_field.left,
            transkription_position.width, transkription_position.height)
        if transkription_position.transform is not None:
            style_content = style_content + ' transform: {}; '.format(transkription_position.transform.toCSSTransformString())
            # The origin is only shifted if the resulting shift is negative.
            x_shift = (transkription_position.left - round(transkription_position.transform.getX(), 1)) * -1
            transform_origin_x = x_shift if x_shift < 0 else 0
            style_content = style_content + ' transform-origin: {}px {}px; '.format(transform_origin_x, transkription_position.height)
        link = E.A(' ', E.CLASS(highlight_class), title=title, style=style_content)
        transkription.append(link)
def create_pdf_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, pdf_file_name='output.pdf', bg_color=SVGConverter.BG_COLOR):
    """Creates a pdf file highlighting some words.

    A temporary svg file is created next to the pdf file, converted to pdf
    and removed afterwards.

    :param xml_source_file: word position file, used if page is None.
    :param page: the page to convert.
    :param highlighted_words: the words to highlight.
    :param pdf_file_name: target file; '.pdf' is appended if it is missing.
    :param bg_color: color of the highlighted words.
    """
    pdf_suffix = '.pdf'
    if not pdf_file_name.endswith(pdf_suffix):
        pdf_file_name = pdf_file_name + pdf_suffix
    # Only swap the suffix: '.pdf' may also occur elsewhere in the path.
    tmp_svg_file = pdf_file_name[:-len(pdf_suffix)] + '.svg'
    create_svg_with_highlighted_words(xml_source_file=xml_source_file, page=page, highlighted_words=highlighted_words,
                                      svg_file_name=tmp_svg_file, bg_color=bg_color)
    if isfile(tmp_svg_file):
        try:
            cairosvg.svg2pdf(url=tmp_svg_file, write_to=pdf_file_name)
        finally:
            # Remove the temporary file even if the conversion fails.
            remove(tmp_svg_file)
def create_svg_with_highlighted_words(xml_source_file=None, page=None, highlighted_words=None, svg_file_name='output.svg', bg_color=SVGConverter.BG_COLOR):
    """Creates a svg file highlighting some words.

    :param xml_source_file: word position file, the page is created from it if page is None.
    :param page: the page to convert.
    :param highlighted_words: the words to highlight.
    :param svg_file_name: target file; '.svg' is appended if the name does not end with 'svg'.
    :param bg_color: color of the highlighted words.
    """
    need_page_from_file = page is None and xml_source_file is not None
    if need_page_from_file:
        page = Page(xml_source_file)
    svg_converter = SVGConverter(page, bg_color=bg_color)
    target_file = svg_file_name if svg_file_name.endswith('svg') else svg_file_name + '.svg'
    svg_converter.convert(output_file=target_file, highlighted_words=highlighted_words)
def usage():
    """prints information on how to use the script
    """
    help_text = main.__doc__
    print(help_text)
def main(argv):
    """This program can be used to convert the word positions to HTML, SVG or TEXT for testing purposes.

    svgscripts/convert_wordPositions.py OPTIONS <file>

        OPTIONS:
        -h|--help: show help
        -H|--HTML [default] convert to HTML test file
        -k|--key=key option for json converter:
        only convert object == page.__dict__[key]
        -o|--output=outputFile save output to file outputFile
        -P|--PDF convert to PDF test file
        -S|--SVG convert to SVG test file
        -s|--svg=svgFile: svg web file
        -T|--TEXT convert to TEXT output
        -t|--text=text highlight word
        -w|--word-insertion-mark show word insertion mark on HTML
        -v|--version=VERSION show words that belong to writing process VERSION: { 0, 1, 2, 0-1, 0+, etc. }
        -x|--testing execute in test mode, do not write to file or open browser

        :return: exit code (int)
    """
    convert_to_type = None
    key = ''
    non_testing = True
    output_file = None
    page = None
    show_word_insertion_mark = False
    stage_version = ''
    svg_file = None
    text = None

    try:
        opts, args = getopt.getopt(argv, "hk:t:HPSTws:o:v:x",
                                   ["help", "key=", "text=", "HTML", "PDF", "SVG", "TEXT", "word-insertion-mark", "svg=", "output=", "version=", "testing"])
    except getopt.GetoptError:
        usage()
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help') or not args:
            usage()
            return 0
        elif opt in ('-v', '--version'):
            if re.match(r'^(\d|\d\+|\d\-\d)$', arg):
                stage_version = arg
            else:
                raise ValueError('OPTION -v|--version=VERSION does not work with "{}" as value for VERSION!'.format(arg))
        elif opt in ('-w', '--word-insertion-mark'):
            show_word_insertion_mark = True
        elif opt in ('-P', '--PDF'):
            convert_to_type = 'PDF'
        elif opt in ('-S', '--SVG'):
            convert_to_type = 'SVG'
        elif opt in ('-T', '--TEXT'):
            convert_to_type = 'TEXT'
        elif opt in ('-H', '--HTML'):
            convert_to_type = 'HTML'
        elif opt in ('-x', '--testing'):
            non_testing = False
        elif opt in ('-s', '--svg'):
            svg_file = arg
        elif opt in ('-o', '--output'):
            output_file = arg
        elif opt in ('-k', '--key'):
            key = arg
        elif opt in ('-t', '--text'):
            text = arg
            print(arg)

    if len(args) < 1:
        usage()
        return 2
    if convert_to_type is None:
        # Without an explicit type, the suffix of the output file decides.
        if output_file is not None and '.' in output_file:
            convert_to_type = output_file.rsplit('.', 1)[-1].upper()
        else:
            convert_to_type = 'HTML'
    exit_code = 0
    for word_position_file in args:
        if not isfile(word_position_file):
            print("'{}' does not exist!".format(word_position_file))
            return 2
        if convert_to_type == 'PDF':
            if output_file is None:
                output_file = 'output.pdf'
            highlighted_words = None
            if text is not None:
                page = Page(word_position_file)
                highlighted_words = [word for word in page.words if word.text == text]
            create_pdf_with_highlighted_words(word_position_file, pdf_file_name=output_file, highlighted_words=highlighted_words)
        else:
            if svg_file is not None:
                if isfile(svg_file):
                    page = PageCreator(word_position_file, svg_file=svg_file)
                else:
                    # Report the missing svg file, not the word position file.
                    print("'{}' does not exist!".format(svg_file))
                    return 2
            else:
                page = Page(word_position_file)
                if page.svg_file is None:
                    print('Please specify a svg file!')
                    usage()
                    return 2
            highlighted_words = None
            if text is not None:
                highlighted_words = [word for word in page.words if word.text == text]
                print([(word.id, word.text) for word in highlighted_words])
            converter = Converter.CREATE_CONVERTER(page, non_testing=non_testing, converter_type=convert_to_type,
                                                   show_word_insertion_mark=show_word_insertion_mark, key=key)
            exit_code = converter.convert(output_file=output_file, stage_version=stage_version, highlighted_words=highlighted_words)
    return exit_code
# Run the conversion when the file is executed as a script and hand the
# exit code of main back to the caller.
if __name__ == "__main__":
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
Event Timeline
Log In to Comment