Page MenuHomec4science

oai_repository_updater.py
No OneTemporary

File Metadata

Created
Thu, Jun 6, 16:14

oai_repository_updater.py

## This file is part of Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""OAI Repository administration tool -
Updates the metadata of the records to include OAI identifiers and
OAI SetSpec according to the settings defined in OAI Repository
admin interface
"""
__revision__ = "$Id$"
import os
import sys
import time
if sys.hexversion < 0x2040000:
# pylint: disable=W0622
from sets import Set as set
# pylint: enable=W0622
from tempfile import mkstemp
from invenio.config import \
CFG_OAI_ID_FIELD, \
CFG_OAI_ID_PREFIX, \
CFG_OAI_SET_FIELD, \
CFG_BINDIR, \
CFG_SITE_NAME, \
CFG_TMPDIR
from invenio.search_engine import \
perform_request_search, \
get_fieldvalues, \
get_record
from invenio.intbitset import intbitset as HitSet
from invenio.dbquery import run_sql
from invenio.bibtask import \
task_get_option, \
task_set_option, \
write_message, \
task_update_progress, \
task_init, \
task_sleep_now_if_required
from invenio.bibrecord import \
record_delete_subfield, \
field_xml_output
# Opening <datafield> element for the MARC field configured to hold the
# OAI setSpec values. In the 6-character field specification, characters
# 0-2 are the tag and characters 3 and 4 the two indicators; an underscore
# in the configuration is written as a blank indicator.
DATAFIELD_SET_HEAD = '<datafield tag="%s" ind1="%s" ind2="%s">' % (
    CFG_OAI_SET_FIELD[0:3],
    CFG_OAI_SET_FIELD[3:4].replace('_', ' '),
    CFG_OAI_SET_FIELD[4:5].replace('_', ' '))

# Opening <datafield> element for the MARC field configured to hold the
# OAI identifier, built the same way from CFG_OAI_ID_FIELD.
DATAFIELD_ID_HEAD = '<datafield tag="%s" ind1="%s" ind2="%s">' % (
    CFG_OAI_ID_FIELD[0:3],
    CFG_OAI_ID_FIELD[3:4].replace('_', ' '),
    CFG_OAI_ID_FIELD[4:5].replace('_', ' '))
def get_set_definitions(set_spec):
    """
    Read from the oaiREPOSITORY table all the definitions stored for
    the given setSpec.

    A definition is the search pattern selecting the records that
    belong to the set. Each one is returned as the dictionary built by
    parse_set_definition(), completed with the keys 'setSpec' and
    'setName'.

    Parameters:

      set_spec - *str* the setSpec whose definitions are wanted
    """
    rows = run_sql("select setName, setDefinition from oaiREPOSITORY where setSpec=%s",
                   (set_spec, ))
    definitions = []
    for row in rows:
        name, raw_definition = row
        parsed = parse_set_definition(raw_definition)
        # Set after parsing, so that these two keys always reflect the
        # database columns
        parsed['setSpec'] = set_spec
        parsed['setName'] = name
        definitions.append(parsed)
    return definitions
def parse_set_definition(set_definition):
    """
    Returns the parameters for the given set definition.

    The set definition is a ';'-separated list of 'key=value' items.
    The returned structure is a dictionary with keys being
    c, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3 and corresponding
    values. Keys missing from the definition keep their default value
    (empty string, or 'a' for op1 and op2). Items that contain no '='
    are ignored.

    @param set_definition: a string as returned by the database for column 'setDefinition'
    @return: a dictionary
    """
    params = {'c':'',
              'p1':'', 'f1':'', 'm1':'',
              'p2':'', 'f2':'', 'm2':'',
              'p3':'', 'f3':'', 'm3':'',
              'op1':'a', 'op2':'a'}
    for definition in set_definition.split(';'):
        # Split on the first '=' only: a value may itself contain '='
        # (e.g. a search pattern such as 'year=2010') and must not be
        # silently dropped, which would leave the pattern empty.
        arguments = definition.split('=', 1)
        if len(arguments) == 2:
            params[arguments[0]] = arguments[1]
    return params
def all_set_specs():
    """
    Returns the list of the distinct setSpecs found in the
    oaiREPOSITORY table.

    The "empty" setSpec is part of the result if some setting uses it.
    The same setSpec can appear several times in the settings (one row
    per search query defining it): the SELECT DISTINCT makes sure each
    value is returned only once.
    """
    return [spec_row[0] for spec_row in
            run_sql("SELECT DISTINCT setSpec FROM oaiREPOSITORY")]
def get_recids_for_set_spec(set_spec):
    """
    Returns, as a HitSet, the recids matched by at least one of the
    definitions of the given setSpec.

    Parameters:

      set_spec - *str* the set_spec for which we would like to get the
                 recids
    """
    # Keys of a set definition that are handed over unchanged to the
    # search engine
    pattern_keys = ('p1', 'f1', 'm1', 'op1',
                    'p2', 'f2', 'm2', 'op2',
                    'p3', 'f3', 'm3')
    matched = HitSet()
    for definition in get_set_definitions(set_spec):
        # 'c' is a comma-separated list of collection names
        collections = [name.strip() for name in definition['c'].split(',')]
        search_args = dict([(key, definition[key]) for key in pattern_keys])
        hits = perform_request_search(c=collections, ap=0, **search_args)
        matched = matched.union(HitSet(hits))
    return matched
def get_set_name_for_set_spec(set_spec):
    """
    Returns the OAI setName associated with a setSpec.

    Several definitions can share the same setSpec while carrying
    different setNames: the first non-empty setName returned by the
    database is used. An empty string is returned when there is none.

    Parameters:

      set_spec - *str* the set_spec for which we would like to get the
                 setName
    """
    rows = run_sql("select setName from oaiREPOSITORY where setSpec=%s and setName!=''",
                   (set_spec, ))
    if not rows:
        return ""
    return rows[0][0]
def print_repository_status(write_message=write_message,
                            verbose=0):
    """
    Prints the repository status, one line at a time, through
    'write_message'.

    Parameters:

      write_message - *function* the function used to write the output.
                      It is called with a single string argument.

            verbose - *int* the verbosity of the output
                       - 0: print repository size
                       - 1: print quick status of each set (numbers
                         can be wrong if the repository is in some
                         inconsistent state, i.e. a record is in an
                         OAI setSpec but has not OAI ID)
                       - 2: print detailed status of repository, with
                         number of records that needs to be
                         synchronized according to the sets
                         definitions. Precise, but ~slow...
    """
    repository_size_s = "%d" % repository_size()
    # Union of the records of all the sets, as they should be after an
    # update (only filled in when verbose > 1)
    repository_recids_after_update = HitSet()

    write_message(CFG_SITE_NAME)
    write_message(" OAI Repository Status")

    set_spec_max_length = 19 # How many max char do we display for
    set_name_max_length = 20 # setName and setSpec?

    if verbose == 0:
        # Just print repository size
        write_message(" Total(**)" + " " * 29 +
                      " " * (9 - len(repository_size_s)) + repository_size_s)
        return
    elif verbose == 1:
        # We display few information: show longer set name and spec
        set_spec_max_length = 30
        set_name_max_length = 30

    write_message("=" * 80)
    header = " setSpec" + " " * (set_spec_max_length - 7) + \
             " setName" + " " * (set_name_max_length - 5) + " Volume"
    if verbose > 1:
        header += " " * 5 + "After update(*):"
    write_message(header)

    if verbose > 1:
        write_message(" " * 57 + "Additions Deletions")

    write_message("-" * 80)

    for set_spec in all_set_specs():

        if verbose <= 1:
            # Get the records that are in this set. This is an
            # incomplete check, as it can happen that some records are
            # in this set (according to the metadata) but have no OAI
            # ID (so they are not exported). This can happen if the
            # repository has some records coming from external
            # sources, or if it has never been synchronized with this
            # tool.
            current_recids = perform_request_search(c=CFG_SITE_NAME,
                                                    p1=set_spec,
                                                    f1=CFG_OAI_SET_FIELD,
                                                    m1="e", ap=0)
        else:
            # Get the records that are *currently* exported for this
            # setSpec
            current_recids = perform_request_search(c=CFG_SITE_NAME,
                                                    p1=set_spec,
                                                    f1=CFG_OAI_SET_FIELD,
                                                    m1="e", ap=0, op1="a",
                                                    p2="oai:*",
                                                    f2=CFG_OAI_ID_FIELD,
                                                    m2="e")
            # Get the records that *should* be in this set according to
            # the admin defined settings, and compute how many should be
            # added or removed
            should_recids = get_recids_for_set_spec(set_spec)
            repository_recids_after_update = \
                repository_recids_after_update.union(should_recids)

            current_hitset = HitSet(current_recids)
            nb_add_recids = len(should_recids.difference(current_hitset))
            nb_remove_recids = len(current_hitset.difference(should_recids))
            nb_should_recids = len(should_recids)
        nb_current_recids = len(current_recids)

        # Adapt setName and setSpec strings lengths
        set_spec_str = set_spec
        if len(set_spec_str) > set_spec_max_length :
            set_spec_str = "%s.." % set_spec_str[:set_spec_max_length]
        set_name_str = get_set_name_for_set_spec(set_spec)
        if len(set_name_str) > set_name_max_length :
            set_name_str = "%s.." % set_name_str[:set_name_max_length]

        row = " " + set_spec_str + \
              " " * ((set_spec_max_length + 2) - len(set_spec_str)) + set_name_str + \
              " " * ((set_name_max_length + 2) - len(set_name_str)) + \
              " " * (7 - len(str(nb_current_recids))) + str(nb_current_recids)
        if verbose > 1:
            row += \
                " " * max(9 - len(str(nb_add_recids)), 0) + '+' + str(nb_add_recids) + \
                " " * max(7 - len(str(nb_remove_recids)), 0) + '-' + str(nb_remove_recids) + " = " +\
                " " * max(7 - len(str(nb_should_recids)), 0) + str(nb_should_recids)
        write_message(row)

    write_message("=" * 80)
    footer = " Total(**)" + " " * (set_spec_max_length + set_name_max_length - 7) + \
             " " * (9 - len(repository_size_s)) + repository_size_s
    if verbose > 1:
        # Computed here rather than inside the loop, so that the value
        # also exists (as 0) when no set is defined at all
        nb_recids_after_update = len(repository_recids_after_update)
        footer += ' ' * (28 - len(str(nb_recids_after_update))) + str(nb_recids_after_update)
    write_message(footer)

    if verbose > 1:
        write_message(' *The "after update" columns show the repository after you run this tool.')
    else:
        write_message(' *"Volume" is indicative if repository is out of sync. Use --detailed-report.')
    write_message('**The "total" is not the sum of the above numbers, but the union of the records.')
def repository_size():
    """
    Returns the number of records found when searching for 'oai:*' in
    the field CFG_OAI_ID_FIELD.
    """
    exported_recids = perform_request_search(p1="oai:*",
                                             f1=CFG_OAI_ID_FIELD,
                                             m1="e",
                                             ap=0)
    return len(exported_recids)
### MAIN ###
def oairepositoryupdater_task():
    """
    Main business logic code of oai_archive.

    Unless a report has been requested (task option "report"), the task:
      - collects the records matched by the sets defined in the
        oaiREPOSITORY table, plus the records that already carry an OAI
        identifier with the configured prefix;
      - writes into a temporary MARCXML file (in CFG_TMPDIR) a
        correction for each record whose OAI identifier is missing or
        whose OAI sets differ from the ones it should have;
      - runs 'bibupload -c' on that file, unless the task option
        "no_upload" is set. When uploading is enabled but no record
        needs a change, the temporary file is removed instead.

    Always returns True.
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report:
        # Any requested report level (1: quick, 2: detailed) only
        # prints the status: asking for a report must never end up
        # modifying the records.
        print_repository_status(verbose=report)
        return True

    task_update_progress("Fetching records to process")

    # Build the list of records to be processed, that is, search for
    # the records that match one of the search queries defined in OAI
    # Repository admin interface.
    recids_for_set = {} # Remember exactly which record belongs to which set
    recids = HitSet() # "Flat" set of the recids_for_set values
    for set_spec in all_set_specs():
        task_sleep_now_if_required(can_stop_too=True)
        _recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = _recids
        recids = recids.union(_recids)

    # Also get the list of records that are currently exported through
    # OAI and that might need to be refreshed
    oai_recids = perform_request_search(c=CFG_SITE_NAME,
                                        p1='oai:%s:*' % CFG_OAI_ID_PREFIX,
                                        f1=CFG_OAI_ID_FIELD,
                                        m1="e", ap=0)
    recids = recids.union(HitSet(oai_recids))
    # The set of records does not change any more: count it only once
    nb_recids = len(recids)

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                             prefix='oairepository_' + \
                             time.strftime("%Y%m%d_%H%M%S_",
                                           time.localtime()))
    oai_out = os.fdopen(fd, "w")
    has_updated_records = False
    # try/finally: the file handle must be released even if the task is
    # interrupted or fails while records are being processed
    try:
        oai_out.write('<collection>')

        # Iterate over the recids
        for index, recid in enumerate(recids):
            task_sleep_now_if_required(can_stop_too=True)
            task_update_progress("Done %s out of %s records." % \
                                 (index + 1, nb_recids))

            # Check if an OAI identifier is already in the record or
            # not.
            oai_id_entry = "<subfield code=\"%s\">oai:%s:%s</subfield>\n" % \
                           (CFG_OAI_ID_FIELD[5:6], CFG_OAI_ID_PREFIX, recid)
            oai_ids = [_oai_id for _oai_id in \
                       get_fieldvalues(recid, CFG_OAI_ID_FIELD) \
                       if _oai_id.strip() != '']
            already_has_oai_id = len(oai_ids) > 0

            # Get the sets to which this record already belongs according
            # to the metadata
            current_oai_sets = set(\
                [_oai_set for _oai_set in \
                 get_fieldvalues(recid, CFG_OAI_SET_FIELD) \
                 if _oai_set.strip() != ''])

            # Get the sets that should be in this record according to
            # settings (the "empty" setSpec is never written)
            updated_oai_sets = set(\
                [_set for _set, _recids in recids_for_set.items()
                 if recid in _recids if _set])

            # Ok, we have the old sets and the new sets. If they are equal
            # and oai ID does not need to be added, then great, nothing to
            # change . Otherwise apply the new sets.
            if current_oai_sets == updated_oai_sets and already_has_oai_id:
                continue # Jump to next recid

            has_updated_records = True

            # Generate the xml sets entry
            oai_set_entry = '\n'.join(["<subfield code=\"%s\">%s</subfield>" % \
                                       (CFG_OAI_SET_FIELD[5:6], _oai_set) \
                                       for _oai_set in updated_oai_sets]) + \
                                       "\n"

            # Also get all the datafields with tag and indicator matching
            # CFG_OAI_SET_FIELD[:5] and CFG_OAI_ID_FIELD[:5] but with
            # subcode != CFG_OAI_SET_FIELD[5:6] and subcode !=
            # CFG_OAI_ID_FIELD[5:6], so that we can preserve these values
            other_data = marcxml_filter_out_tags(recid, [CFG_OAI_SET_FIELD,
                                                         CFG_OAI_ID_FIELD])

            _write_oai_record(oai_out, recid, oai_id_entry,
                              oai_set_entry, other_data)

        oai_out.write('</collection>')
    finally:
        oai_out.close()
    write_message("Wrote to file %s" % filename)

    if not no_upload:
        task_sleep_now_if_required(can_stop_too=True)
        if has_updated_records:
            command = "%s/bibupload -c %s -u oairepository" % (CFG_BINDIR, filename)
            os.system(command)
        else:
            # Nothing to correct: do not leave an empty collection behind
            os.remove(filename)

    return True

def _write_oai_record(oai_out, recid, oai_id_entry, oai_set_entry, other_data):
    """
    Writes to 'oai_out' the MARCXML <record> element correcting the OAI
    fields of record 'recid'.

    Parameters:

            oai_out - *file* the opened output file

              recid - *int* the id of the record

       oai_id_entry - *str* the <subfield> holding the OAI identifier

      oai_set_entry - *str* the <subfield> elements holding the OAI sets

         other_data - *str* the MARCXML of the other datafields to
                      preserve, as returned by marcxml_filter_out_tags()
    """
    parts = ["<record>\n",
             "<controlfield tag=\"001\">%s"
             "</controlfield>\n" % recid,
             DATAFIELD_ID_HEAD,
             "\n",
             oai_id_entry]
    if CFG_OAI_ID_FIELD[0:5] != CFG_OAI_SET_FIELD[0:5]:
        # OAI ID and sets live in different datafields: close the ID
        # datafield and open the one for the sets. Otherwise both go in
        # the same datafield.
        parts.extend(["</datafield>\n",
                      DATAFIELD_SET_HEAD,
                      "\n"])
    parts.extend([oai_set_entry,
                  "</datafield>\n",
                  other_data,
                  "</record>\n"])
    oai_out.writelines(parts)
def marcxml_filter_out_tags(recid, fields):
    """
    Returns, as MARCXML, the datafields of record 'recid' that have the
    same tag and indicators as the given 'fields', once the subfields
    named by 'fields' have been removed from them. Handy to emulate a
    bibupload -c that would correct only specific subfields.

    Parameters:

       recid - *int* the id of the record to process

      fields - *list(str)* the list of fields that we want to filter
               out. Eg ['909COp', '909COo']
    """
    record = get_record(recid)

    # First drop the subfields that are going to be replaced
    for field in fields:
        record_delete_subfield(record,
                               tag=field[0:3],
                               ind1=field[3:4],
                               ind2=field[4:5],
                               subfield_code=field[5:6])

    # Then keep only the datafields sharing tag + indicators
    xml_chunks = []
    seen_tags_and_ind = []
    for field in fields:
        tag_and_ind = field[0:5]
        if tag_and_ind in seen_tags_and_ind:
            # These datafields have already been output
            continue
        seen_tags_and_ind.append(tag_and_ind)

        tag = field[0:3]
        ind1 = field[3:4].replace('_', ' ')
        ind2 = field[4:5].replace('_', ' ')
        for datafield in record.get(tag, []):
            # datafield[0] is presumably the list of remaining
            # subfields: datafields left empty are skipped
            if datafield[1] == ind1 and datafield[2] == ind2 and datafield[0]:
                xml_chunks.append(field_xml_output(datafield, tag) + '\n')

    return ''.join(xml_chunks)
#########################
def main():
    """
    Entry point: either prints the repository status right away, or
    hands over to task_init() to set up the bibtask.
    """
    # If a report option is found in the arguments, just print the
    # status and exit (do not run through BibSched...). The detailed
    # report wins over the quick one.
    cli_args = sys.argv[1:]
    if '-d' in cli_args or '--detailed-report' in cli_args:
        mode = 2
    elif '-r' in cli_args or '--report' in cli_args:
        mode = 1
    else:
        mode = -1

    if mode != -1:
        def _stdout_message(*args):
            """Stand-in for the BibTask write_message, so that the
            report does not need to run in BibSched environment"""
            sys.stdout.write(args[0] + '\n')

        print_repository_status(write_message=_stdout_message,
                                verbose=mode)
        return

    description = "Examples:\n" \
        " Expose records according to sets defined in OAI Repository admin interface\n" \
        " $ oairepositoryupdater \n" \
        " Expose records according to sets defined in OAI Repository admin interface and update them every day\n" \
        " $ oairepositoryupdater -s24\n" \
        " Print OAI repository status\n" \
        " $ oairepositoryupdater -r\n" \
        " Print OAI repository detailed status\n" \
        " $ oairepositoryupdater -d\n\n"
    usage = "Options:\n" \
        " -r --report\t\tOAI repository status\n" \
        " -d --detailed-report\t\tOAI repository detailed status\n" \
        " -n --no-process\tDo no upload the modifications\n"
    long_options = ["report",
                    "detailed-report",
                    "no-process"]

    task_init(authorization_action='runoairepository',
              authorization_msg="OAI Archive Task Submission",
              description=description,
              help_specific_usage=usage,
              version=__revision__,
              specific_params=("rdn", long_options),
              task_submit_elaborate_specific_parameter_fnc=
              task_submit_elaborate_specific_parameter,
              task_run_fnc=oairepositoryupdater_task)
def task_submit_elaborate_specific_parameter(key, value, opts, args):
    """
    Elaborate specific CLI parameters of oairepositoryupdater.

    Stores the task option matching the given command line option:
      -r, --report           sets "report" to 1
      -d, --detailed-report  sets "report" to 2
      -n, --no-process       sets "no_upload" to 1

    Parameters:

        key - *str* the option being processed (e.g. '-r' or '--report')

      value - the value given to the option (unused: no option takes one)

       opts - the other parsed options (unused)

       args - the remaining command line arguments (unused)

    Returns True if 'key' is one of the options above, False otherwise.
    """
    if key in ("-r", "--report"):
        task_set_option("report", 1)
    # A single if/elif chain: each recognised option, including
    # -r/--report, must reach the final 'return True'
    elif key in ("-d", "--detailed-report"):
        task_set_option("report", 2)
    elif key in ("-n", "--no-process"):
        task_set_option("no_upload", 1)
    else:
        return False
    return True
### Run main() only when executed as a script, not when imported
if __name__ == "__main__":
    main()

Event Timeline