Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F109610328
plotextractor_output_utils.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, Apr 22, 16:14
Size
21 KB
Mime Type
text/x-python
Expires
Thu, Apr 24, 16:14 (2 d)
Engine
blob
Format
Raw Data
Handle
25737135
Attached To
R3600 invenio-infoscience
plotextractor_output_utils.py
View Options
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
import
os
import
re
import
sys
from
invenio.config
import
CFG_TMPDIR
from
invenio.textutils
import
encode_for_xml
from
invenio.bibrecord
import
field_xml_output
# Path of a placeholder image inside the configured temporary directory
# (CFG_TMPDIR). create_MARC uses it as the file location for captions whose
# image is unknown.
DUMMY_IMAGE_TMP = os.path.join(CFG_TMPDIR, 'plotextractor_dummy.png')
def write_message(message):
    """
    Print a single message on standard output.

    @param: message (string): the text to print
    """
    # Parenthesised form: behaves like the old print statement on Python 2
    # for a single argument and is valid syntax on Python 3 as well.
    print(message)
def write_messages(messages):
    """
    Print each message of a sequence, in order, through write_message.

    @param: messages ([string, string, ...]): the messages to print
    """
    for single_message in messages:
        write_message(single_message)
def find_open_and_close_braces(line_index, start, brace, lines):
    """
    Take the line where we want to start and the index where we want to start
    and find the first instance of matched open and close braces of the same
    type as brace in the given lines.

    @param: line_index (int): the index of the line we want to start
        searching at
    @param: start (int): the index in the line we want to start searching at
    @param: brace (string): one of the type of brace we are looking for
        ({, }, [, ], ( or ))
    @param: lines ([string, string, ...]): the array of lines in the file we
        are looking in.

    @return: (open_index, open_line, close_index, close_line)
        (int, int, int, int): the position of the opening brace and of its
        closing brace, each as an index within a line plus the index of
        that line (the closing brace may be on a later line).
        Special results:
        (-1, -1, -1, -1) if brace is not one of the supported characters;
        (0, line_index, 0, line_index) if no opening brace is found from
        the starting position on (or line_index is past the end of lines);
        (open_index, open_line, open_index, open_line) if the opening brace
        is never closed.
    """
    for open_brace, close_brace in (('[', ']'), ('{', '}'), ('(', ')')):
        if brace in (open_brace, close_brace):
            break
    else:
        # unacceptable brace type!
        return (-1, -1, -1, -1)

    first_line_index = line_index

    if line_index >= len(lines):
        # nothing to search in (e.g. empty list of lines): report it the
        # same way as "no opening brace found" instead of an IndexError
        return (0, first_line_index, 0, first_line_index)

    line = lines[line_index]
    ret_open_index = line.find(open_brace, start)

    # sometimes people don't put the braces on the same line
    # as the tag
    while ret_open_index == -1:
        line_index = line_index + 1
        if line_index >= len(lines):
            # failed to find open braces...
            return (0, first_line_index, 0, first_line_index)
        line = lines[line_index]
        ret_open_index = line.find(open_brace)

    ret_open_line = line_index

    # number of opening braces seen that are still waiting for their
    # closing brace; the one we just found is the first
    depth = 1

    # both cursors start on the opening brace and are advanced in lockstep:
    # each pass looks for the next close brace and the next open brace
    open_index = ret_open_index
    close_index = ret_open_index

    while depth > 0:
        if open_index == -1 and close_index == -1:
            # we hit the end of the line: continue on the next one
            line_index = line_index + 1

            if line_index >= len(lines):
                # hanging braces!
                return (ret_open_index, ret_open_line,
                        ret_open_index, ret_open_line)

            line = lines[line_index]
            # search from column 0 to not skip things that are at the
            # beginning of the line
            close_index = line.find(close_brace)
            open_index = line.find(open_brace)
        else:
            if close_index != -1:
                close_index = line.find(close_brace, close_index + 1)
            if open_index != -1:
                open_index = line.find(open_brace, open_index + 1)

        if close_index != -1:
            depth = depth - 1
            # done when everything is closed and the next opening brace (if
            # any) only comes after this closing brace
            if depth == 0 and \
                    (open_index > close_index or open_index == -1):
                break
        if open_index != -1:
            depth = depth + 1

    return (ret_open_index, ret_open_line, close_index, line_index)
def assemble_caption(begin_line, begin_index, end_line, end_index, lines):
    """
    Take information about the caption of a picture and put it all together
    in a nice way. If it spans multiple lines, put it on one line. If it
    contains a label tag, remove it. Escape the characters that are not
    allowed in MARCXML and strip one pair of enclosing curly braces.

    @param: begin_line (int): the index of the line where the caption begins
    @param: begin_index (int): the index within the line where the caption
        begins
    @param: end_line (int): the index of the line where the caption ends
    @param: end_index (int): the index within the line where the caption ends
    @param: lines ([string, string, ...]): the line strings of the text

    @return: caption (string): the caption, nicely formatted and pieced together
    """
    # stuff we don't like
    label_head = '\\label{'

    # reassemble the caption
    if end_line > begin_line:
        # our caption spanned multiple lines: rest of the first line, all the
        # lines in between, and the beginning of the last line
        pieces = [lines[begin_line][begin_index:]]
        pieces.extend([lines[included_line_index] for included_line_index
                       in range(begin_line + 1, end_line)])
        pieces.append(lines[end_line][:end_index])
        caption = ' '.join(pieces)
        caption = caption.replace('\n', ' ')
        # joining the pieces may have produced doubled blanks
        caption = caption.replace('  ', ' ')
    else:
        # it fit on one line
        caption = lines[begin_line][begin_index:end_index]

    # clean out a label tag, if there is one
    label_begin = caption.find(label_head)
    if label_begin > -1:
        # the caption is a single string by now, so the label tag is all on
        # one "line"; search it as a one-line document and keep only the
        # index of the closing brace
        label_end = find_open_and_close_braces(0, label_begin, '{',
                                               [caption])[2]
        caption = caption[:label_begin] + caption[label_end + 1:]

    # clean out characters not allowed in MARCXML
    # not allowed: & < >
    try:
        caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'),
                                 wash=True)
    except Exception:
        # the encoding step is known to fail on some input (it threw an
        # error on astro-ph/0601014): report it and escape by hand.
        # '&' has to be replaced first so that the entities added for
        # '<' and '>' are not escaped a second time.
        sys.stderr.write(caption)
        sys.stderr.write(' cannot be processed\n')
        caption = caption.replace('&', '&amp;').replace('<', '&lt;')
        caption = caption.replace('>', '&gt;')

    caption = caption.strip()

    # drop one pair of braces wrapping the whole caption
    if len(caption) > 1 and caption[0] == '{' and caption[-1] == '}':
        caption = caption[1:-1]

    return caption
def prepare_image_data(extracted_image_data, tex_file, image_list):
    """
    Prepare the extracted image data for output: resolve every image name to
    the location of its file and drop the entries that cannot be used.

    Entries whose image is 'ERROR' are dropped. Entries with an empty image
    name are kept untouched. All other entries are kept only if
    get_image_location finds a location for the image and that location
    exists on disk; the image name is then replaced by that location.

    @param: extracted_image_data ([(string, string, string), ...]): the
        (image, caption, label) triples, ordered
    @param: tex_file (string): the location of the TeX (its directory is
        where the images are looked for)
    @param: image_list ([string, string, ...]): a list of the converted
        image file names

    @return: ([(string, string, string), ...]): the cleaned list of
        (image location, caption, label) triples, in the original order
    """
    tex_dir = os.path.split(tex_file)[0]
    cleaned = []

    for image, caption, label in extracted_image_data:
        if image == 'ERROR':
            continue

        if image == '':
            # nothing to resolve, the caption is kept without an image
            cleaned.append((image, caption, label))
            continue

        image_loc = get_image_location(image, tex_dir, image_list)
        if image_loc is None or not os.path.exists(image_loc):
            continue
        cleaned.append((image_loc, caption, label))

    return cleaned
def remove_dups(extracted_image_data):
    """
    Merge the entries that refer to the same image into a single entry.

    The first entry of every image is kept (with its label and contexts) and
    its caption becomes the distinct captions found for that image, in order
    of appearance, joined with ' : '.

    @param: extracted_image_data ([(string, string, list, list),
        (string, string, list, list),...]): the full list of images, captions,
        labels and contexts extracted from this file

    @return: ([(string, string, list, list), ...]): the same list in the same
        order, with one entry per image and condensed captions
    """
    # first pass: distinct captions per image, in order of appearance
    captions_by_image = {}
    for image, caption, _label, _contexts in extracted_image_data:
        known_captions = captions_by_image.setdefault(image, [])
        if caption not in known_captions:
            known_captions.append(caption)

    # second pass: emit one entry per image, at the position of its first
    # occurrence (the order of the original list is known to be correct)
    condensed = []
    for image, _caption, label, contexts in extracted_image_data:
        captions = captions_by_image.pop(image, None)
        if captions is None:
            # a previous entry for this image was already emitted
            continue
        condensed.append((image, ' : '.join(captions), label, contexts))

    return condensed
def create_contextfiles(extracted_image_data):
    """
    Save the contexts of each image to a file named after the image with
    '.context' appended. Every context is written followed by an empty line.
    Entries without contexts or with an empty image name are skipped.

    @param: extracted_image_data ([(string, string, list, list), ...]):
        a list of tuples of images matched to labels, captions and contexts
        from this document.

    @return: None, the result is the files written next to the images
    """
    for image, dummy2, dummy3, contexts in extracted_image_data:
        if len(contexts) > 0 and image != "":
            context_filepath = image + '.context'
            fd = open(context_filepath, 'w')
            # try/finally so that the file is closed even when a write fails
            try:
                for context_line in contexts:
                    fd.write(context_line + '\n\n')
            finally:
                fd.close()
def create_MARC(extracted_image_data, tarball, refno):
    """
    Take the images with their captions and contexts and build a MARCXML
    record for them.

    The record starts with a 001 field when refno is a string of digits.
    Every image then gets an FFT field; an image with contexts gets a second
    FFT field for its '.context' file. An entry without image gets one FFT
    field per caption, pointing at DUMMY_IMAGE_TMP.

    @param: extracted_image_data ([(string, string, list, list), ...]):
        a list of tuples of images matched to labels, captions and contexts
        from this document.
    @param: tarball (string): the location of the tarball; the directory
        named after it with '_plots' appended is treated as the root that is
        cut off from the image locations to build the document names
    @param: refno (string): the value for the record number field, or None

    @return: (string): the MARCXML record, one element per line
    """
    root_dir = os.path.dirname(tarball) + os.sep + \
        os.path.basename(tarball) + '_plots' + os.sep

    # For building result MARCXML
    # Datafield := (subfields, ind1, ind2, controlfield)
    # Subfield := (code, value)
    record_lines = ['<record>']

    #FIXME: Determine what to do without refno
    if refno and refno.isdigit():
        record_lines.append(field_xml_output((None, ' ', ' ', refno), '001'))

    # running number, used to prefix the descriptions so they keep their order
    index = 0
    for (image_location, caption, dummy, contexts) in extracted_image_data:
        if image_location == '':
            # we don't know the image, but the captions are for separate
            # things: add a DUMMY-PLOT field per loose caption
            for cap in caption.split(' : '):
                subfields = [('a', DUMMY_IMAGE_TMP),
                             ('t', "PlotMisc"),
                             ('d', "%05d %s" % (index, cap)),
                             ('n', "fig%05d" % (index,)),
                             ('o', "HIDDEN")]
                record_lines.append(
                    field_xml_output((subfields, ' ', ' ', None), "FFT"))
                index += 1
            continue

        # Merge subfolder into docname, until root directory
        relative_image_path = image_location.replace(root_dir, '')
        name_without_extension = "_".join(relative_image_path.split('.')[:-1])
        docname = name_without_extension.replace('/', '_')
        description = "%05d %s" % (index, caption.replace(' : ', ''))

        if len(caption) < 3:
            # (almost) no caption: file the image as a hidden PlotMisc
            subfields = [('a', image_location),
                         ('t', "PlotMisc"),
                         ('d', description),
                         ('n', docname),
                         ('o', "HIDDEN")]
        else:
            # Add PLOT MARCXML
            subfields = [('a', image_location),
                         ('t', "Plot"),
                         ('d', description),
                         ('n', docname)]
        record_lines.append(
            field_xml_output((subfields, ' ', ' ', None), "FFT"))

        if contexts:
            # Add CONTEXT MARCXML
            subfields = [('a', "%s.context" % (image_location,)),
                         ('t', "Plot"),
                         ('f', ".png;context"),
                         ('n', docname),
                         ('o', "HIDDEN")]
            record_lines.append(
                field_xml_output((subfields, ' ', ' ', None), "FFT"))

        index += 1

    record_lines.append('</record>')
    return '\n'.join(record_lines)
def get_image_location(image, sdir, image_list, recurred=False):
    """
    Take a raw image reference from the TeX and a directory and return the
    location of the (possibly converted) image.

    The reference is first cleaned from 'figure='/'file=' keys, from a
    leading includegraphics command, './', TeX tag or '='. The converted
    (.png) name is then looked for in image_list, in a few usual figure
    subfolders, anywhere directly in sdir or its subfolders, and one and two
    directories above sdir. If all of that fails, the reference is split on
    blanks, commas and equal signs and each fragment is tried on its own.

    @param: image (string): the name of the raw image from the TeX
    @param: sdir (string): the directory where everything was unzipped to
    @param: image_list ([string, string, ...]): the list of images that
        were extracted from the tarball and possibly converted; if None, the
        listing of sdir is used instead
    @param: recurred (bool): True when called on a fragment of a reference,
        which prevents the fragment from being split again

    @return: converted_image (string): the location of the (possibly
        converted) image file, None if it could not be found
    """
    if isinstance(image, list):
        # image is a list, not good
        return None

    image = str(image).strip()

    # strip option keys and a leftover includegraphics command; the result
    # of str.replace/re.sub has to be assigned back, strings are immutable.
    # \b keeps names that merely end in "file=" or "figure=" untouched.
    image = re.sub('\\b(figure=|file=)', '', image)
    # plain string replacement: the text is not a valid regular expression
    image = image.replace('\\includegraphics{', '')

    image = image.strip()

    # a backslash, a command name and a blank, e.g. "\epsfbox "
    some_kind_of_tag = '\\\\\\w+ '

    if image.startswith('./'):
        image = image[2:]
    if re.match(some_kind_of_tag, image):
        image = image[len(image.split(' ')[0]) + 1:]
    if image.startswith('='):
        image = image[1:]

    if len(image) <= 1:
        # nothing (or a single character) left: not a usable file name
        return None

    image = image.strip()

    image_path = os.path.join(sdir, image)
    converted_image_should_be = get_converted_image_name(image_path)
    converted_file_name = os.path.split(converted_image_should_be)[-1]

    if image_list is None:
        image_list = os.listdir(sdir)

    for png_image in image_list:
        if converted_image_should_be == png_image:
            return png_image

    # maybe it's in one of the usual subfolders (TeX just understands that).
    # The listing goes to its own variable so that image_list reaches the
    # recursive calls below unchanged.
    # NOTE(review): this compares the full expected path with the bare file
    # names returned by os.listdir, so it can only match when sdir is empty
    # - confirm whether a comparison of file names was intended.
    for subfolder in ('eps', 'fig', 'figs', 'Figures', 'Figs'):
        subfolder_path = os.path.join(sdir, subfolder)
        if os.path.isdir(subfolder_path):
            for png_image in os.listdir(subfolder_path):
                if converted_image_should_be == png_image:
                    return os.path.join(subfolder, png_image)

    # maybe it is actually just loose.
    for png_image in os.listdir(sdir):
        if converted_file_name == png_image:
            return converted_image_should_be
        sub_dir = os.path.join(sdir, png_image)
        if os.path.isdir(sub_dir):
            # try that, too!  we just do two levels, because that's all
            # that's reasonable..
            for sub_dir_file in os.listdir(sub_dir):
                if converted_file_name == sub_dir_file:
                    return converted_image_should_be

    # maybe it's actually up a directory or two: this happens in nested
    # tarballs where the TeX is stored in a different directory from the
    # images
    one_dir_up = os.path.split(sdir)[0]
    two_dirs_up = os.path.split(one_dir_up)[0]
    for upper_dir in (one_dir_up, two_dirs_up):
        for png_image in os.listdir(upper_dir):
            if converted_file_name == png_image:
                return converted_image_should_be

    if recurred:
        return None

    # agh, this calls for drastic measures: try every fragment of the
    # reference as an image name of its own
    for separator in (' ', ',', '='):
        for piece in image.split(separator):
            res = get_image_location(piece, sdir, image_list, recurred=True)
            if res is not None:
                return res

    return None
def get_converted_image_name(image):
    """
    Give the name an image has after its conversion to png format: the old
    extension (the text after the last dot of the file name) is replaced by
    '.png'. A name already ending in '.png' is returned as it is.

    @param: image (string): The fullpath of the image before conversion

    @return: converted_image (string): the fullpath of the image after convert
    """
    if image.endswith('.png'):
        # it already ends in png!  we're golden
        return image

    img_dir, file_name = os.path.split(image)

    if '.' in file_name:
        # cut off the old extension
        base_name = file_name.rsplit('.', 1)[0]
    else:
        # no extension at all
        base_name = file_name

    return os.path.join(img_dir, base_name + '.png')
def get_tex_location(new_tex_name, current_tex_name, recurred=False):
    """
    Take the name of a TeX file and attempt to match it to an actual file
    on disk, looking around the TeX file that referenced it. If nothing is
    found, the search is repeated once with '.tex' appended to the name.

    @param: new_tex_name (string): the name of the TeX file to find
    @param: current_tex_name (string): the location of the TeX file where we
        found the reference
    @param: recurred (bool): True for the second attempt (with '.tex'
        appended), so that no third attempt is made

    @return: tex_location (string): the location of the other TeX file on
        disk or None if it is not found
    """
    current_dir = os.path.split(current_tex_name)[0]

    # a backslash, a command name and a blank
    some_kind_of_tag = '\\\\\\w+ '

    new_tex_name = new_tex_name.strip()

    # NOTE(review): this also shortens file names that merely begin with
    # "input" - confirm that callers only pass such names with the command.
    if new_tex_name.startswith('input'):
        new_tex_name = new_tex_name[len('input'):]

    if re.match(some_kind_of_tag, new_tex_name):
        tag = new_tex_name.split(' ')[0]
        new_tex_name = new_tex_name[len(tag) + 1:]

    if new_tex_name.startswith('./'):
        new_tex_name = new_tex_name[2:]

    if len(new_tex_name) == 0:
        # the name has been stripped down to nothing
        return None

    new_tex_name = new_tex_name.strip()

    new_tex_folder, new_tex_file = os.path.split(new_tex_name)
    if new_tex_folder == new_tex_file:
        new_tex_folder = ''

    # could be in the current directory
    if new_tex_file in os.listdir(current_dir):
        return os.path.join(current_dir, new_tex_file)

    # could be in a subfolder of the current directory, or in a subfolder of
    # a directory one or two levels higher
    one_level_up = os.path.split(current_dir)[0]
    two_levels_up = os.path.split(one_level_up)[0]
    for base_dir in (current_dir, one_level_up, two_levels_up):
        candidate_dir = os.path.join(base_dir, new_tex_folder)
        if not os.path.isdir(candidate_dir):
            continue
        if new_tex_file in os.listdir(candidate_dir):
            return os.path.join(candidate_dir, new_tex_file)

    if recurred:
        return None

    # last resort: the reference may have left out the extension
    return get_tex_location(new_tex_name + '.tex', current_tex_name,
                            recurred=True)
Event Timeline
Log In to Comment