Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F60579633
create_task.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, May 1, 05:51
Size
13 KB
Mime Type
text/x-python
Expires
Fri, May 3, 05:51 (2 d)
Engine
blob
Format
Raw Data
Handle
17344897
Attached To
rNIETZSCHEPYTHON nietzsche-python
create_task.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to create a task.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import
abc
from
colorama
import
Fore
,
Style
import
getopt
import
inspect
import
itertools
import
lxml.etree
as
ET
import
re
import
shutil
import
sys
import
os
from
os
import
listdir
,
sep
,
makedirs
from
os.path
import
exists
,
isfile
,
isdir
,
dirname
,
basename
,
splitext
if
dirname
(
__file__
)
not
in
sys
.
path
:
sys
.
path
.
append
(
dirname
(
__file__
))
from
convert_wordPositions
import
create_pdf_with_highlighted_words
,
create_svg_with_highlighted_words
from
util
import
copy_xml_file_word_pos_only
,
get_mismatching_ids
from
datatypes.page
import
Page
from
datatypes.faksimile
import
FaksimilePage
#from join_faksimileAndTranskription import STATUS_MERGED_OK
from
util
import
ExternalViewer
,
create_highlighted_svg_file
# Module metadata.
__author__ = "Christian Steiner"
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
__license__ = "GPL v3"
__version__ = "0.0.1"

# Not referenced by the code visible in this file; presumably toggled by the
# unit tests -- TODO confirm.
UNITTESTING = False
# Default value of Task's `bg_color` parameter (color used for highlighting).
HIGHLIGHT_COLOR = 'red'
# Default value of Task's `opacity` parameter; note that it is a string.
OPACITY = '0.5'
class Task(metaclass=abc.ABCMeta):
    """This abstract class can be used to create a task.
    """
    # NOTE: the class docstring above is runtime text: __init__ uses
    # self.__doc__ as the default task description, so it is kept verbatim.
    #
    # A task collects, inside `target_dir`, the files produced by create():
    # a transkription file (pdf or svg) with highlighted words, optionally a
    # copy of the xml file, optionally a text note, and a faksimile svg with
    # highlighted rects. Subclasses have to implement get_node_ids() and
    # select_words().

    # Name of the sub directory of target_dir whose content is listed as
    # `finished_files`.
    finish_dir = 'Fertig'

    def __init__(self, xml_source_file, target_dir, page=None, faksimile_svgFile=None, dirname=None,
                 description='', edit_transkription=False, edit_xml=False, manual=None,
                 status_contains='', bg_color=HIGHLIGHT_COLOR, opacity=OPACITY):
        """Initialize the task and register files that already exist in its target directory.

        :param xml_source_file: xml file of the page; used to instantiate a Page if `page` is None.
        :param target_dir: directory in which the task files are created.
        :param page: the Page to work on (optional).
        :param faksimile_svgFile: faksimile svg file that is used if the page does not refer to one.
        :param dirname: optional sub directory of `target_dir` that becomes the task's target directory.
        :param description: description of the task; defaults to the docstring of the task's class.
        :param edit_transkription: if True, create() writes the transkription as svg, otherwise as pdf.
        :param edit_xml: if True, create() also copies the xml file to the target directory.
        :param manual: optional file that create() copies to the target directory.
        :param status_contains: stored on the instance; not evaluated by this class.
        :param bg_color: color used for highlighting.
        :param opacity: opacity used for highlighting the faksimile svg.
        """
        self.xml_source_file = xml_source_file
        self.page = page if page is not None else Page(self.xml_source_file)
        self.faksimile_svgFile = faksimile_svgFile
        self.target_dir = target_dir + sep + dirname if dirname is not None else target_dir
        self.dirname = dirname
        self.description = description if description != '' else self.__doc__
        self.edit_transkription = edit_transkription
        self.edit_xml = edit_xml
        self.manual = manual
        self.status_contains = status_contains
        self.bg_color = bg_color
        self.opacity = opacity
        self.created_files = []
        self.finished_files = []
        if isdir(self.target_dir):
            # listdir() returns bare entry names, so each name has to be joined
            # with target_dir before it is tested; testing the bare name would
            # check it against the current working directory instead.
            self.created_files = [entry for entry in listdir(self.target_dir)
                                  if not isdir(self.target_dir + sep + entry)]
        finished_path = self.target_dir + sep + self.finish_dir
        if isdir(finished_path):
            self.finished_files = listdir(finished_path)

    def create(self):
        """Create the files of this task in the target directory.

        Every file that exists after its creation is appended to
        `self.created_files`.

        :raises Exception: if neither the page nor the task refers to a faksimile svg file.
        """
        # Use self.finish_dir like every other method does, so that a subclass
        # overriding finish_dir creates the directory it later looks into.
        makedirs(self.target_dir + sep + self.finish_dir, exist_ok=True)
        if self.manual is not None and isfile(self.manual):
            shutil.copy(self.manual, self.target_dir)
        words = self.select_words(self.page.words)
        if self.edit_transkription:
            transkription_file = self.target_dir + sep\
                    + self.create_file_name(self.page, is_faksimile_svg=False)
            create_svg_with_highlighted_words(page=self.page, highlighted_words=words,
                                              svg_file_name=transkription_file, bg_color=self.bg_color)
        else:
            transkription_file = self.target_dir + sep\
                    + self.create_file_name(self.page, is_faksimile_svg=False, suffix='.pdf')
            create_pdf_with_highlighted_words(page=self.page, highlighted_words=words,
                                              pdf_file_name=transkription_file, bg_color=self.bg_color)
        if self.edit_xml:
            xml_file = copy_xml_file_word_pos_only(self.page.page_tree.docinfo.URL, self.target_dir)
            self.created_files.append(xml_file)
        note = self.create_note_about_missing_words()
        if note != '':
            note_file = self.target_dir + sep\
                    + self.create_file_name(self.page, is_faksimile_svg=False, suffix='.txt')
            # Notes may contain non-ASCII characters (CorrectWords writes German
            # umlauts), hence the explicit encoding. The with statement closes
            # the file.
            with open(note_file, 'w', encoding='utf-8') as note_handle:
                note_handle.write(note)
        if isfile(transkription_file):
            self.created_files.append(transkription_file)
        # The reference of the page takes precedence over the one passed to the task.
        source_svg_file = self.page.faksimile_svgFile if self.page.faksimile_svgFile is not None\
                else self.faksimile_svgFile
        if source_svg_file is None:
            raise Exception('source_svg_file not specified: neither page nor self have a faksimile_svgFile!')
        if self.page.title != '' and self.page.number != -1:
            svg_file = self.target_dir + sep + self.create_file_name(self.page)
        else:
            # Without title and number no name can be derived from the page:
            # reuse the name of the source file.
            svg_file = self.target_dir + sep + basename(source_svg_file)
        faksimile_tree = ET.parse(source_svg_file)
        node_ids = self.get_node_ids()
        create_highlighted_svg_file(faksimile_tree, node_ids, target_file=svg_file,
                                    highlight_color=self.bg_color, opacity=self.opacity)
        if isfile(svg_file):
            self.created_files.append(svg_file)

    def create_file_name(self, page, suffix='.svg', is_faksimile_svg=True):
        """Return a file name (without directory) for page.

        :param page: the page the file belongs to.
        :param suffix: replaces '.xml' in the name of the page's xml file;
            only used if `is_faksimile_svg` is False.
        :param is_faksimile_svg: if True, return '<title>,<number>.svg' with
            the blanks of the title replaced by '-'.
        :return: the file name (str)
        """
        if is_faksimile_svg:
            # NOTE(review): `suffix` is ignored here, the name always ends with '.svg'.
            return page.title.replace(' ', '-') + ',{}.svg'.format(str(page.number))
        return basename(page.page_tree.docinfo.URL).replace('.xml', suffix)

    def create_note_about_missing_words(self):
        """Create a note about missing words for faksimile and transkription ids.

        The default implementation returns an empty string, in which case
        create() does not write a note file.
        """
        return ''

    def contains_file(self, file_name, is_finished=False):
        """Return whether task created a file with basename file_name.

        :param file_name: file name or path; only its basename is compared.
        :param is_finished: if True, look at `finished_files` instead of `created_files`.
        :return: bool
        """
        wanted = basename(file_name)
        candidates = self.finished_files if is_finished else self.created_files
        return any(basename(candidate) == wanted for candidate in candidates)

    def get_fullpath4file(self, file_name):
        """Return the first entry of `created_files` whose basename equals the basename of file_name.

        NOTE(review): entries registered by __init__ are bare names, entries
        appended by create() carry the target directory; the result can
        therefore be either.

        :return: the entry (str) or None if there is no such entry.
        """
        wanted = basename(file_name)
        return next((created_file for created_file in self.created_files
                     if basename(created_file) == wanted), None)

    @abc.abstractmethod
    def get_node_ids(self):
        """Return node ids for faksimile svg rect.
        """
        pass

    def has_been_created(self, page):
        """Return true if task has been created.

        This is the case if the faksimile svg, the transkription svg or the
        xml file of page is among the created or the finished files.
        """
        faksimile_svg = self.create_file_name(page)
        transkription_svg = self.create_file_name(page, is_faksimile_svg=False)
        xml_file = self.create_file_name(page, is_faksimile_svg=False, suffix='.xml')
        return self.contains_file(faksimile_svg)\
                or self.contains_file(transkription_svg)\
                or self.contains_file(xml_file)\
                or self.has_been_finished(page, faksimile_svg=faksimile_svg,
                                          transkription_svg=transkription_svg, xml_file=xml_file)

    def has_been_finished(self, page, faksimile_svg=None, transkription_svg=None, xml_file=None):
        """Return true if task has been finished.

        This is the case if one of the three file names is among the finished
        files. File names that are not passed are derived from page.
        """
        if faksimile_svg is None:
            faksimile_svg = self.create_file_name(page)
        if transkription_svg is None:
            transkription_svg = self.create_file_name(page, is_faksimile_svg=False)
        if xml_file is None:
            xml_file = self.create_file_name(page, is_faksimile_svg=False, suffix='.xml')
        return self.contains_file(faksimile_svg, is_finished=True)\
                or self.contains_file(transkription_svg, is_finished=True)\
                or self.contains_file(xml_file, is_finished=True)

    @abc.abstractmethod
    def select_words(self, words):
        """Returns selected words.
        """
        pass
class SplitFaksimileWordBoxes(Task):
    """Split faksimile word boxes according to how many boxes a word has on the transkription.
    TODO
    """
    # NOTE: the class docstring above is runtime text (Task.__init__ uses
    # self.__doc__ as default description), so it is kept verbatim.
    # NOTE(review): get_node_ids() is not implemented yet, i.e. this class is
    # still abstract and cannot be instantiated.

    def __init__(self, xml_source_file, target_dir):
        """Initialize the task for xml_source_file in target_dir.

        :param xml_source_file: xml file of the page.
        :param target_dir: directory in which the task files are created.
        """
        # Imported here because the module level import of STATUS_MERGED_OK is
        # commented out in this file (presumably in order to avoid a circular
        # import -- TODO confirm).
        from join_faksimileAndTranskription import STATUS_MERGED_OK
        # The original called `__int__`, which is a typo for `__init__`.
        super().__init__(xml_source_file, target_dir, status_contains=STATUS_MERGED_OK)

    def select_words(self, words):
        """Return the selected words; for the time being all words are returned unfiltered.
        """
        #TODO create those functions!!!!
        #return [ word for word in words if word.hasParts() and word.partsMissFaksimilePostion() ]
        return words
class CorrectWords(Task):
    """Correct words from faksimile and from transkription such that they correspond.
    """
    # NOTE: the class docstring above is runtime text (Task.__init__ uses
    # self.__doc__ as default description), so it is kept verbatim.

    def __init__(self, xml_source_file, source_svg_file, target_dir, page=None, unmatched_node_ids=None, edit_xml=True):
        """Set up the task and compute the words that do not match.

        :param xml_source_file: xml file of the page.
        :param source_svg_file: faksimile svg file, used if the page does not refer to one.
        :param target_dir: directory in which the task files are created.
        :param page: the Page to work on (optional).
        :param unmatched_node_ids: list of node ids returned by get_node_ids();
            the list is stored as is, not copied.
        :param edit_xml: passed on to Task.
        """
        super().__init__(xml_source_file, target_dir, page=page, faksimile_svgFile=source_svg_file,
                         edit_transkription=True, edit_xml=edit_xml)
        self.unmatched_words = []
        self.unmatched_faksimile_positions = []
        if unmatched_node_ids is None:
            unmatched_node_ids = []
        self.unmatched_node_ids = unmatched_node_ids
        if self.page is None:
            self.page = Page(self.xml_source_file)
        self.init_unmatched_words()

    def init_unmatched_words(self):
        """Fill unmatched_words and unmatched_faksimile_positions.

        The words of the page are compared, by means of get_mismatching_ids,
        with the word positions of the first faksimile page found for the
        page's number.
        """
        svg_path = self.page.faksimile_svgFile
        if svg_path is None:
            svg_path = self.faksimile_svgFile
        svg_tree = ET.parse(svg_path)
        faksimile_pages = FaksimilePage.GET_FAKSIMILEPAGES(svg_tree, page_number=str(self.page.number))
        first_faksimile_page = faksimile_pages[0]
        mismatches = get_mismatching_ids(self.page.words, first_faksimile_page.word_positions)
        self.unmatched_words, self.unmatched_faksimile_positions = mismatches

    def create_note_about_missing_words(self):
        """Return a text that lists the unmatched words of transkription and faksimile.

        The text always starts with a line containing title and number of the page.
        """
        lines = ['{0},{1}: nicht übereinstimmende Wörter.\n'.format(self.page.title, str(self.page.number))]
        has_unmatched_words = len(self.unmatched_words) > 0
        if has_unmatched_words:
            lines.append('\nFolgende Transkription-Wörter haben keine Entsprechung bei den Wörtern auf dem Faksimile:\n')
            lines.extend('- "{0}", id="{1}", line_number: {2}\n'.format(word.text, word.id, word.line_number)
                         for word in self.unmatched_words)
        has_unmatched_positions = len(self.unmatched_faksimile_positions) > 0
        if has_unmatched_positions:
            lines.append('\nFolgende Faksimile-Wörter haben keine Entsprechung bei den Wörtern der Transkription:\n')
            lines.extend('- "{0}", id: {1}\n'.format(position.text, position.id)
                         for position in self.unmatched_faksimile_positions)
        return ''.join(lines)

    def get_target_filepath(self, page, is_faksimile_svg=True, suffix='.svg', is_finished=False):
        """Return the path of the file for page inside the target directory.

        If is_finished is True, the path points into the finish_dir sub directory.
        """
        file_name = self.create_file_name(page, is_faksimile_svg=is_faksimile_svg, suffix=suffix)
        directory = self.target_dir
        if is_finished:
            directory = directory + sep + self.finish_dir
        return directory + sep + file_name

    def get_node_ids(self):
        """Return the unmatched node ids, which are used as node ids for the faksimile svg rects.
        """
        return self.unmatched_node_ids

    def select_words(self, words):
        """Return the unmatched words, or words itself if there are no unmatched words.
        """
        return self.unmatched_words if len(self.unmatched_words) > 0 else words
def usage(func_name):
    """Print the docstring of func_name, which serves as usage information.

    :param func_name: the object (not its name) whose __doc__ is printed.
    """
    help_text = func_name.__doc__
    print(help_text)
def main_correct_words(argv):
    """This program can be used to create the task 'CorrectWords' in directory ./correct-words.

    svgscripts/create_task.py [OPTIONS] <xml_source_file>

        <xml_source_file>

        OPTIONS:
        -h|--help: show help
        -r|--refdir=dir      reference directory

        :return: exit code (int)
    """
    # NOTE: the docstring above is the help text printed by usage().
    #
    # Returns 2 if the options cannot be parsed or no file is given, else 0.
    tmp_dir = './correct-words'
    ref_dir = None
    try:
        opts, args = getopt.getopt(argv, "hr:", ["help", "refdir="])
    except getopt.GetoptError:
        # Refer to the function directly instead of eval'ing its name from the
        # current frame.
        usage(main_correct_words)
        return 2
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(main_correct_words)
            return 0
        elif opt in ('-r', '--refdir'):
            ref_dir = arg
    if len(args) < 1:
        usage(main_correct_words)
        return 2
    exit_status = 0
    for xml_source_file in args:
        if not isfile(xml_source_file):
            # Tell the user instead of skipping silently; the exit status is
            # deliberately left unchanged.
            print('Skipping {0}. File does not exist!'.format(xml_source_file))
            continue
        page = Page(xml_source_file)
        if ref_dir is not None and isdir(ref_dir):
            ref_file = ref_dir + sep + basename(xml_source_file)
            if isfile(ref_file):
                # A file with the same name in the reference directory
                # provides the words of the page.
                page.words = Page(ref_file).words
        if page.faksimile_svgFile is not None and isfile(page.faksimile_svgFile):
            correct_words = CorrectWords(xml_source_file, page.faksimile_svgFile, tmp_dir, page=page)
            # Highlight every faksimile word position without a matching word.
            for faksimile_position in correct_words.unmatched_faksimile_positions:
                correct_words.unmatched_node_ids.append(faksimile_position.id)
            correct_words.create()
        else:
            print('Skipping {0}. File does not contain a valid faksimile_svgFile reference!'.format(xml_source_file))
    return exit_status
def main(argv):
    """Entry point of the script: delegate to main_correct_words.

    :param argv: command line arguments without the program name.
    :return: the exit code returned by main_correct_words
    """
    exit_code = main_correct_words(argv)
    return exit_code
if __name__ == "__main__":
    # Run main with the command line arguments (program name stripped) and
    # use its return value as exit code of the process.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
Event Timeline
Log In to Comment