Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F74008095
utils.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Jul 25, 18:45
Size
18 KB
Mime Type
text/x-python
Expires
Sat, Jul 27, 18:45 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
19311182
Attached To
R3600 invenio-infoscience
utils.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
OAI Harvest utility functions.
"""
__revision__
=
"$Id$"
import
os
import
re
import
time
import
urlparse
import
calendar
from
invenio.ext.logging
import
register_exception
from
invenio.legacy.oaiharvest
import
getter
from
invenio.config
import
(
CFG_SITE_URL
,
CFG_SITE_ADMIN_EMAIL
,
)
from
invenio.legacy.bibrecord
import
(
record_get_field_instances
,
record_modify_subfield
,
field_xml_output
)
from
invenio.utils.shell
import
run_shell_command
from
invenio.utils.text
import
translate_latex2unicode
from
invenio.legacy.oaiharvest.dblayer
import
create_oaiharvest_log_str
from
invenio.legacy.bibcatalog.api
import
BIBCATALOG_SYSTEM
from
invenio.legacy.bibsched.bibtask
import
(
write_message
,
task_low_level_submission
)
from
invenio.modules.workflows.models
import
BibWorkflowEngineLog
## precompile some often-used regexp for speed reasons:
# Matches the text content of <identifier> elements in OAI-PMH responses.
# Raw string avoids the invalid "\/" escape sequence the original used;
# "/" needs no escaping in regular expressions anyway.
REGEXP_OAI_ID = re.compile(r"<identifier.*?>(.*?)</identifier>", re.DOTALL)
def get_nb_records_in_file(filename):
    """
    Return number of records in FILENAME that is either harvested or
    converted file. Useful for statistics.

    :param filename: full path to the file to inspect
    :return: number of closing </record> tags found, or 0 if the file
             cannot be read
    """
    try:
        # Context manager guarantees the handle is closed even if
        # read() raises (the original leaked the file descriptor).
        with open(filename, 'r') as input_file:
            nb = input_file.read().count("</record>")
    except IOError:
        nb = 0  # file does not exist and such
    return nb
def get_nb_records_in_string(string):
    """
    Return the number of records contained in the given harvested or
    converted content. Useful for statistics.

    :param string: content of a harvested or converted file
    :return: number of closing </record> tags found
    """
    return string.count("</record>")
def create_oaiharvest_log(task_id, oai_src_id, marcxmlfile):
    """
    Function which creates the harvesting logs.

    :param task_id: bibupload task id
    :param oai_src_id: id of the OAI source configuration
    :param marcxmlfile: path to the harvested MARCXML file whose content
        is stored in the log
    """
    # Context manager guarantees the file is closed even if read() fails
    # (the original leaked the descriptor on error; read(-1) == read()).
    with open(marcxmlfile, "r") as file_fd:
        xml_content = file_fd.read()
    create_oaiharvest_log_str(task_id, oai_src_id, xml_content)
def collect_identifiers(harvested_file_list):
    """Collect all OAI PMH identifiers from each file in the list
    and add them to a list of identifiers per file.

    :param harvested_file_list: list of filepaths to harvested files
    :return: list of lists, containing each file's identifier list
    :raises IOError: if a harvested file cannot be opened
    """
    result = []
    for harvested_file in harvested_file_list:
        # Let IOError propagate naturally: the original
        # "except IOError as e: raise e" was a no-op re-raise.
        # The context manager also fixes the leaked file handle.
        with open(harvested_file) as fd_active:
            data = fd_active.read()
        result.append(REGEXP_OAI_ID.findall(data))
    return result
def find_matching_files(basedir, filetypes):
    """
    Find all files matching the given filetypes by looking at the files
    and filenames in the given directory, including subdirectories.

    A file matches when either the output of the 'file' utility mentions
    the filetype, or the filename extension equals it.

    :param basedir: full path to base directory to search in
    :type basedir: string
    :param filetypes: list of filetypes, extensions
    :type filetypes: list
    :return: list of full paths of matching files
    :rtype: list
    """
    matches = []
    for current_dir, _subdirs, names in os.walk(basedir):
        for name in names:
            candidate = os.path.join(current_dir, name)
            # Ask the 'file' utility to describe the content type.
            _status, file_output, _errors = run_shell_command('file %s',
                                                              (candidate,))
            description = file_output.lower()
            extension = name.split('.')[-1].lower()
            for filetype in filetypes:
                if description.find(filetype) > -1:
                    matches.append(candidate)
                elif extension == filetype:
                    matches.append(candidate)
    return matches
def translate_fieldvalues_from_latex(record, tag, code='', encoding='utf-8'):
    """
    Given a record and field tag, this function will modify the record by
    translating the subfield values of found fields from LaTeX to chosen
    encoding for all the subfields with given code (or all if no code is
    given).

    :param record: record to modify, in BibRec style structure
    :type record: dict
    :param tag: tag of fields to modify
    :type tag: string
    :param code: restrict the translation to a given subfield code
    :type code: string
    :param encoding: character encoding for the new value. Defaults to UTF-8.
    :type encoding: string
    """
    field_list = record_get_field_instances(record, tag)
    for field in field_list:
        # field[0] holds the list of (code, value) subfield pairs.
        subfields = field[0]
        subfield_index = 0
        for subfield_code, subfield_value in subfields:
            if code == '' or subfield_code == code:
                # Translate LaTeX markup to unicode, then encode to the
                # requested byte encoding before writing it back.
                newvalue = translate_latex2unicode(subfield_value).encode(encoding)
                # NOTE(review): field[4] is presumably the field's global
                # position in the record — confirm against bibrecord's
                # field-tuple layout.
                record_modify_subfield(record, tag, subfield_code, newvalue,
                                       subfield_index,
                                       field_position_global=field[4])
            # Advance for every subfield (not only modified ones) so the
            # index keeps matching the subfield's actual position.
            subfield_index += 1
def compare_timestamps_with_tolerance(timestamp1, timestamp2, tolerance=0):
    """Compare two timestamps TIMESTAMP1 and TIMESTAMP2, of the form
    '2005-03-31 17:37:26'. Optionally receives a TOLERANCE argument
    (in seconds). Return -1 if TIMESTAMP1 is less than TIMESTAMP2
    minus TOLERANCE, 0 if they are equal within TOLERANCE limit,
    and 1 if TIMESTAMP1 is greater than TIMESTAMP2 plus TOLERANCE.

    :param timestamp1: first timestamp string
    :param timestamp2: second timestamp string
    :param tolerance: allowed difference, in seconds
    """
    def _to_epoch_seconds(stamp):
        # Strip any trailing fractional seconds (e.g. '.00'), then
        # convert the timestamp to Unix epoch seconds.
        normalized = re.sub(r'\.[0-9]+$', '', stamp)
        return calendar.timegm(time.strptime(normalized,
                                             "%Y-%m-%d %H:%M:%S"))

    seconds1 = _to_epoch_seconds(timestamp1)
    seconds2 = _to_epoch_seconds(timestamp2)
    # Compare with guard clauses instead of an if/elif/else chain.
    if seconds1 < seconds2 - tolerance:
        return -1
    if seconds1 > seconds2 + tolerance:
        return 1
    return 0
def generate_harvest_report(workflow, current_task_id=-1):
    """
    Return an applicable subject-line + text to send via e-mail or add to
    a ticket about the harvesting results.

    :param workflow: workflow engine object holding the harvest run data
    :param current_task_id: bibsched task id of the current run (unused
        here, kept for interface compatibility)
    :return: tuple of (subject, text)
    """
    from invenio.modules.oaiharvester.models import OaiHARVEST

    current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    extra_data_workflow = workflow.get_extra_data()
    list_source = ""
    if "task_specific_name" in extra_data_workflow["options"]:
        fullname = str(extra_data_workflow["options"]["repository"]) \
            + extra_data_workflow["options"]["task_specific_name"]
    else:
        fullname = str(extra_data_workflow["options"]["repository"])
    try:
        for i in extra_data_workflow["options"]["repository"]:
            repository = OaiHARVEST.query.filter(OaiHARVEST.name == i).one()
            list_source += "\n" + str(repository.id) + " " \
                + str(repository.baseurl)
    except Exception:
        # Was a bare "except:": keep the best-effort behaviour but do not
        # swallow SystemExit/KeyboardInterrupt as well.
        list_source = "No information"
    try:
        if extra_data_workflow["options"]["identifiers"]:
            # One-shot manual harvest
            harvesting_prefix = "Manual harvest"
        else:
            # Automatic
            harvesting_prefix = "Periodical harvesting"
    except KeyError:
        harvesting_prefix = "Periodical harvesting"
    subject = "%s of '%s' finished %s" % (harvesting_prefix, fullname,
                                          current_time)
    if workflow.counter_error:
        subject += " with errors"
    text = """
The %(harvesting)s completed with %(number_errors)d errors at %(ctime)s.
Please forward this mail to administrators. <%(admin_mail)s>
Repositories which have been harvested are :
id base url:
%(list_source)s
""" % {'ctime': current_time,
       'admin_mail': CFG_SITE_ADMIN_EMAIL,
       'harvesting': harvesting_prefix,
       'number_errors': workflow.counter_error,
       'list_source': list_source}
    try:
        text += """
List of OAI IDs harvested:
%(identifiers)s
""" % {'identifiers': str(extra_data_workflow["options"]["identifiers"])}
    except KeyError:
        text += """
No identifiers specified.
"""
    # Collect only the warning/error-level engine log entries
    # (log_type > 10) for this workflow run.
    workflowlog = BibWorkflowEngineLog.query.filter(
        BibWorkflowEngineLog.id_object == workflow.uuid
    ).filter(
        BibWorkflowEngineLog.log_type > 10
    ).all()
    logs = ""
    for log in workflowlog:
        logs += str(log) + '\n'
    text += """
Logs :
%(logs)s
""" % {'logs': logs}
    return subject, text
def record_extraction_from_file(path):
    """
    Read a harvested file and transform each record as if it was another
    independent harvested document.

    :param path: path of the harvested file
    :return: list of records, each one re-wrapped with the header and
        footer markup of the document designated by path

    *This function is much FASTER (3-5 TIMES) than using regex.*
    """
    list_of_records = []   # will contain all the extracted records
    temporary_record = ""  # record currently being accumulated
    footer = ""            # all lines from the closing list tag onwards
    header = ""            # all lines before the first record
    step = 0               # 0: in header, 1: inside records, 2: in footer
    # Open read-only (the original used 'r+' but never writes) and close
    # the handle deterministically via the context manager.
    with open(path, 'r') as source:
        for line in source:
            if step == 0:
                # Extraction of the header
                if not line.startswith("<record>"):
                    header += line
                else:
                    step = 1
                    temporary_record = line
            elif step == 1:
                if line.startswith("</ListRecords>") or \
                        line.startswith("</GetRecord>"):
                    step = 2
                    footer = line
                elif line.startswith("<record>"):
                    # New record starts: reset the accumulator.
                    temporary_record = line
                elif line.startswith("</record>"):
                    temporary_record += line
                    list_of_records.append(temporary_record)
                else:
                    temporary_record += line
            elif step == 2:
                footer += line
    # Reassemble each record with the shared header and footer.
    for i in range(len(list_of_records)):
        list_of_records[i] = header + list_of_records[i] + footer
    return list_of_records
def harvest_step(obj, harvestpath):
    """
    Perform the entire harvesting step.

    Dispatches to identifier-based harvesting when explicit OAI
    identifiers were requested, otherwise harvests by date range.

    :param obj: workflow object carrying the repository configuration
    :param harvestpath: filepath to harvest records into
    :return: result of the chosen harvesting helper (list of files)
    """
    identifiers_requested = obj.extra_data["options"]["identifiers"]
    if identifiers_requested:
        # Harvesting is done per identifier instead of server-updates
        return harvest_by_identifiers(obj, harvestpath)
    return harvest_by_dates(obj, harvestpath)
def harvest_by_identifiers(obj, harvestpath):
    """
    Harvest an OAI repository by identifiers.

    Given a repository "object" (dict from DB) and a list of OAI
    identifiers of records in the repository, perform an OAI harvest
    using GetRecord for each. The records will be harvested into the
    specified filepath.

    :param obj: workflow object carrying repository data and identifiers
    :param harvestpath: filepath to harvest records into
    :return: list of harvested files
    """
    collected_files = []
    metadata_prefix = obj.data["metadataprefix"]
    base_url = obj.data["baseurl"]
    for oai_identifier in obj.extra_data["options"]["identifiers"]:
        # One GetRecord request per identifier.
        files_for_identifier = oai_harvest_get(prefix=metadata_prefix,
                                               baseurl=base_url,
                                               harvestpath=harvestpath,
                                               verb="GetRecord",
                                               identifier=oai_identifier)
        collected_files.extend(files_for_identifier)
    return collected_files
def call_bibupload(marcxmlfile, mode=None, oai_src_id=-1, sequence_id=None):
    """
    Create a bibupload task for the task scheduler in given mode
    on given file. Returns the generated task id and logs the event
    in oaiHARVESTLOGS, also adding any given oai source identifier.

    :param marcxmlfile: base-marcxmlfilename to upload
    :param mode: list of bibupload mode flags (defaults to ["-r", "-i"])
    :param oai_src_id: id of current source config
    :param sequence_id: sequence-number, if relevant
    :return: task_id if successful, otherwise None.
    """
    if mode is None:
        mode = ["-r", "-i"]
    if os.path.exists(marcxmlfile):
        try:
            # Copy the mode list so that extending the arguments never
            # mutates a caller-supplied list (the original "args = mode"
            # aliased it and extend() modified it in place).
            args = list(mode)
            # Add job with priority 6 (above normal bibedit tasks)
            # and file to upload to arguments
            args.extend(["-P", "6", marcxmlfile])
            if sequence_id:
                args.extend(['-I', str(sequence_id)])
            task_id = task_low_level_submission("bibupload", "oaiharvest",
                                                *tuple(args))
            create_oaiharvest_log(task_id, oai_src_id, marcxmlfile)
        except Exception as msg:
            write_message("An exception during submitting oaiharvest task occured : %s " % (str(msg)))
            return None
        return task_id
    else:
        write_message("marcxmlfile %s does not exist" % (marcxmlfile,))
        return None
def harvest_by_dates(obj, harvestpath):
    """
    Harvest an OAI repository by dates.

    Given a repository "object" (dict from DB) and from/to dates, perform
    an OAI harvest request for records updated between the given dates.
    If no dates are given, the repository is harvested from the
    beginning. If you set fromdate == last-run and todate == None, then
    the repository will be harvested since last time (most common type).
    The records will be harvested into the specified filepath.

    :param obj: workflow object carrying the repository configuration
    :param harvestpath: filepath to harvest records into
    :return: list of harvested files
    """
    dates_option = obj.extra_data["options"]["dates"]
    if dates_option:
        # Explicit from/to dates were requested.
        fromdate = str(dates_option[0])
        todate = str(dates_option[1])
    elif obj.data["lastrun"] is None or obj.data["lastrun"] == '':
        # Never harvested before: take everything from the beginning.
        fromdate = None
        todate = None
        obj.extra_data["_should_last_run_be_update"] = True
    else:
        # Incremental harvest since the last recorded run; keep only the
        # date part of the "lastrun" timestamp.
        fromdate = str(obj.data["lastrun"]).split()[0]
        todate = None
        obj.extra_data["_should_last_run_be_update"] = True
    return oai_harvest_get(prefix=obj.data["metadataprefix"],
                           baseurl=obj.data["baseurl"],
                           harvestpath=harvestpath,
                           fro=fromdate,
                           until=todate,
                           setspecs=obj.data["setspecs"])
def oai_harvest_get(prefix, baseurl, harvestpath,
                    fro=None, until=None, setspecs=None,
                    user=None, password=None, cert_file=None,
                    key_file=None, method="POST", verb="ListRecords",
                    identifier=""):
    """
    Retrieve OAI records from the given repository, with given arguments.

    :param prefix: OAI metadata prefix to request
    :param baseurl: base URL of the OAI repository
    :param harvestpath: filepath to harvest records into
    :param fro: harvest records updated from this date
    :param until: harvest records updated until this date
    :param setspecs: space-separated OAI set specs to restrict to
    :param user: HTTP auth user name
    :param password: HTTP auth password
    :param cert_file: client certificate file
    :param key_file: client key file
    :param method: HTTP method to use
    :param verb: OAI-PMH verb to perform
    :param identifier: OAI identifier (for GetRecord requests)
    """
    try:
        parsed_url = urlparse.urlparse(baseurl)
        addressing_scheme = parsed_url[0]
        network_location = parsed_url[1]
        path = parsed_url[2]
        secure = addressing_scheme == "https"

        # Build the OAI-PMH request parameters, skipping unset options.
        http_param_dict = {'verb': verb,
                           'metadataPrefix': prefix}
        if identifier:
            http_param_dict['identifier'] = identifier
        if fro:
            http_param_dict['from'] = fro
        if until:
            http_param_dict['until'] = until

        sets = None
        if setspecs:
            sets = [oai_set.strip() for oai_set in setspecs.split(' ')]

        return getter.harvest(network_location, path, http_param_dict,
                              method, harvestpath, sets, secure, user,
                              password, cert_file, key_file)
    except (StandardError, getter.InvenioOAIRequestError) as exce:
        register_exception()
        raise Exception("An error occurred while harvesting from %s: %s\n"
                        % (baseurl, str(exce)))
def create_authorlist_ticket(matching_fields, identifier, queue):
    """
    Submit a ticket generated by UNDEFINED affiliations in extracted
    authors from collaboration authorlists.

    :param matching_fields: list of (tag, field_instances) for UNDEFINED nodes
    :type matching_fields: list
    :param identifier: OAI identifier of record
    :type identifier: string
    :param queue: the RT queue to send a ticket to
    :type queue: string
    :return: the ID of the created ticket, or None on failure
    :rtype: int or None
    """
    subject = "[OAI Harvest] UNDEFINED affiliations for record %s" % (identifier,)
    # Render every offending field instance as XML, one per line.
    rendered_fields = []
    for tag, field_instances in matching_fields:
        for field in field_instances:
            rendered_fields.append(field_xml_output(field, tag))
    text = """
Harvested record with identifier %(ident)s has had its authorlist extracted and contains some UNDEFINED affiliations.
To see the record, go here: %(baseurl)s/search?p=%(ident)s
If the record is not there yet, try again later. It may take some time for it to load into the system.
List of unidentified fields:
%(fields)s
""" % {'ident': identifier,
       'baseurl': CFG_SITE_URL,
       'fields': "\n".join(rendered_fields)}
    return create_ticket(queue, subject, text)
def create_ticket(queue, subject, text=""):
    """
    Submit a ticket using the configured BibCatalog system.

    :param queue: the ticketing queue to send a ticket to
    :type queue: string
    :param subject: subject of the ticket
    :type subject: string
    :param text: the main text or body of the ticket. Optional.
    :type text: string
    :return: the ID of the created ticket, or None on failure
    :rtype: int or None
    """
    # Initialize BibCatalog connection as default user, if possible.
    # FIX: the module imports BIBCATALOG_SYSTEM (upper-case); the original
    # referenced an undefined lower-case "bibcatalog_system", which would
    # raise NameError at runtime.
    if BIBCATALOG_SYSTEM is not None:
        bibcatalog_response = BIBCATALOG_SYSTEM.check_system()
    else:
        bibcatalog_response = "No ticket system configured"
    if bibcatalog_response != "":
        write_message("BibCatalog error: %s\n" % (bibcatalog_response,))
        return None
    ticketid = BIBCATALOG_SYSTEM.ticket_submit(subject=subject, queue=queue)
    if text:
        comment = BIBCATALOG_SYSTEM.ticket_comment(None, ticketid, text)
        if comment is None:
            write_message("Error: commenting on ticket %s failed."
                          % (str(ticketid),))
    return ticketid
Event Timeline
Log In to Comment