Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F84512952
extractAndConvert.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Sep 23, 08:25
Size
5 KB
Mime Type
text/x-python
Expires
Wed, Sep 25, 08:25 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
20948181
Attached To
rNIETZSCHEPYTHON nietzsche-python
extractAndConvert.py
View Options
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" This program can be used to start the extraction of word positions and to convert the word positions to HTML for testing purposes.
"""
# Copyright (C) University of Basel 2019 {{{1
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/> 1}}}
import
getopt
import
re
import
sys
from
os
import
sep
,
path
from
os.path
import
isfile
import
lxml.etree
as
ET
from
extractWordPosition
import
Extractor
from
convert_wordPositions
import
HTMLConverter
sys
.
path
.
append
(
'shared_util'
)
from
myxmlwriter
import
write_pretty
# Module metadata: authorship, contact, licensing and version information.
__author__ = "Christian Steiner"
# The maintainer is the same person as the author.
__maintainer__ = __author__
__copyright__ = 'University of Basel'
__email__ = "christian.steiner@unibas.ch"
__status__ = "Development"
# Matches the GPL notice in the file header.
__license__ = "GPL v3"
__version__ = "0.0.1"
def usage():
    """Print the command line help of this script.

    The help text is the docstring of main, so both stay in sync.
    """
    help_text = main.__doc__
    print(help_text)
def _first_xpath_match(tree, expression):
    """Return the first result of evaluating the xpath expression on tree, or None if there is none."""
    matches = tree.xpath(expression)
    return matches[0] if matches else None


def main(argv):
    """This program can be used to start extraction of word position and showing the word positions to HTML for testing purposes.

    svgscripts/extractAndConvert.py [OPTIONS] <file>

        <file>  svg file OR xml target file containing file name of svg file as "/page/@source".

        OPTIONS:
        -h|--help:                          show help
        -s|--svg=svgFile:                   svg web file
        -H|--HTML                           [default] convert to HTML test file
        -T|--Testing                        run the HTML conversion in testing mode
        -x|--xml-target-file=xmlOutputFile: target file
        -p|--page=pageNumber:               page number of the current page. For use with _one_ file only.
        -P|--PDF=pdfFile:                   pdf file - used for word correction
        -t|--title=title:                   title of the manuscript to which the current page(s) belong(s)

        :param argv: command line arguments without the program name
        :return: exit code (int)
    """
    file_name = None
    non_testing = True
    page_number = None
    pdfFile = None
    svg_file = None
    title = None
    xml_dir = ".{}xml".format(sep)
    xml_target_file = None

    try:
        opts, args = getopt.getopt(
            argv,
            "hTHt:p:s:x:P:",
            ["help", "Testing", "HTML", "title=", "page=", "svg=", "xml-target-file=", "PDF="])
    except getopt.GetoptError:
        usage()
        return 2

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-T', '--Testing'):
            non_testing = False
        elif opt in ('-t', '--title'):
            title = arg
        elif opt in ('-p', '--page'):
            page_number = arg
        elif opt in ('-s', '--svg'):
            svg_file = arg
        elif opt in ('-P', '--PDF'):
            pdfFile = arg
        elif opt in ('-x', '--xml-target-file'):
            xml_target_file = arg
        # -H/--HTML is accepted but needs no handling: HTML is the only conversion done here.

    if len(args) < 1 or args[0].endswith('xml'):
        # No positional file, or the positional file is an xml target file:
        # read the missing information from the xml target file.
        if xml_target_file is None and len(args) > 0:
            xml_target_file = args[0]
        if xml_target_file is not None and isfile(xml_target_file):
            target_file_tree = ET.parse(xml_target_file)
            root = target_file_tree.getroot()
            file_name = root.get('source')
            # Values given on the command line take precedence over the xml attributes.
            if title is None:
                title = root.get('title')
            if page_number is None:
                page_number = root.get('number')
            if svg_file is None:
                # If any svg-image element exists, only its file-name attribute is
                # considered; the svg/@file attribute is used solely when there is none.
                if target_file_tree.xpath('//svg-image'):
                    svg_file = _first_xpath_match(target_file_tree, './/svg-image/@file-name')
                else:
                    svg_file = _first_xpath_match(target_file_tree, './/svg/@file')
    else:
        file_name = args[0]

    if file_name is None:
        # No input file could be determined at all: show the help.
        usage()
        return 2
    if not isfile(file_name):
        print("'{}' does not exist!".format(file_name))
        return 2

    extractor = Extractor(xml_dir=xml_dir, title=title)
    page = extractor.extract_information(
        file_name,
        page_number=page_number,
        xml_target_file=xml_target_file,
        pdfFile=pdfFile,
        svg_file=svg_file)
    if page.svg_file is None:
        print('Please specify a svg file!')
        usage()
        return 2

    converter = HTMLConverter(page, non_testing=non_testing)
    converter.convert()

    if xml_target_file is not None:
        # The result is always written into xml_dir, keeping only the base name of the target file.
        xml_target_file = path.join(xml_dir, path.basename(xml_target_file))
        page.page_tree.getroot().set('source', file_name)
        write_pretty(
            xml_element_tree=page.page_tree,
            file_name=xml_target_file,
            script_name=__file__,
            file_type='svgWordPosition')
    return 0
if __name__ == "__main__":
    # Pass everything after the program name to main and use its return value as exit status.
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
Event Timeline
Log In to Comment