invenio_connector.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Thu, Jun 27, 19:08

invenio_connector.py
View Options

	# -*- coding: utf-8 -*-
	#
	# This file is part of Invenio.
	# Copyright (C) 2010, 2011, 2013 CERN.
	#
	# Invenio is free software; you can redistribute it and/or
	# modify it under the terms of the GNU General Public License as
	# published by the Free Software Foundation; either version 2 of the
	# License, or (at your option) any later version.
	#
	# Invenio is distributed in the hope that it will be useful, but
	# WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with Invenio; if not, write to the Free Software Foundation, Inc.,
	# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
	"""
	Tools to connect to distant Invenio servers using Invenio APIs.

	Example of use:

	from InvenioConnector import *
	cds = InvenioConnector("http://cds.cern.ch")

	results = cds.search("higgs")

	for record in results:
	    print record["245__a"][0]
	    print record["520__b"][0]
	    for author in record["100__"]:
	        print author["a"][0], author["u"][0]

	FIXME:
	- implement cache expiration
	- exceptions handling
	- parsing of <!-- Search-Engine-Total-Number-Of-Results: N -->
	- better checking of input parameters (especially InvenioConnector.__init__ "url")
	- improve behaviour when running locally (perform_request_search requiring "req" object)
	"""
	import urllib
	import urllib2
	import xml.sax
	import re
	import tempfile
	import os
	import time
	import sys

	# mechanize releases older than this version need the separate ClientForm
	# package for their form parser (see _SGMLParserFactory).
	MECHANIZE_CLIENTFORM_VERSION_CHANGE = (0, 2, 0)
	try:
	    import mechanize
	    OLD_MECHANIZE_VERSION = (
	        mechanize.__version__ < MECHANIZE_CLIENTFORM_VERSION_CHANGE)
	    if OLD_MECHANIZE_VERSION:
	        # A missing ClientForm is treated like a missing mechanize: the
	        # ImportError is caught below.
	        import ClientForm
	    MECHANIZE_AVAILABLE = True
	except ImportError:
	    # Authenticated requests are refused later when this flag is False.
	    MECHANIZE_AVAILABLE = False

	try:
	    # if we are running locally, we can optimize :-)
	    from invenio.config import CFG_SITE_URL, CFG_SITE_SECURE_URL, CFG_SITE_RECORD, CFG_CERN_SITE
	    from invenio.bibtask import task_low_level_submission
	    from invenio.search_engine import perform_request_search, collection_restricted_p
	    from invenio.bibformat import format_records
	    from invenio.urlutils import make_user_agent_string
	    LOCAL_SITE_URLS = [CFG_SITE_URL, CFG_SITE_SECURE_URL]
	    CFG_USER_AGENT = make_user_agent_string("invenio_connector")
	except ImportError:
	    # No local Invenio installation: the connector can only talk to
	    # remote servers. LOCAL_SITE_URLS = None is what the rest of the
	    # module checks to decide that.
	    LOCAL_SITE_URLS = None
	    CFG_CERN_SITE = 0
	    CFG_USER_AGENT = "invenio_connector"
	    # Record.url() reads CFG_SITE_RECORD; without a fallback it raised
	    # NameError on machines without Invenio.
	    # NOTE(review): "record" is assumed to be the stock Invenio value for
	    # this setting -- confirm against the target server.
	    CFG_SITE_RECORD = "record"

	CFG_CDS_URL = "http://cds.cern.ch/"

	class InvenioConnectorAuthError(Exception):
	    """
	    Raised by InvenioConnector when authentication fails during
	    remote or local connections.
	    """

	    def __init__(self, value):
	        """
	        Store the error description on the exception.

	        @param value: an error string to display to the user.
	        @type value: string
	        """
	        # Forward the message to Exception so that ``args``, ``repr()``
	        # and pickling carry it too (it used to be dropped).
	        Exception.__init__(self, value)
	        self.value = value

	    def __str__(self):
	        """
	        Return the error description (the contents of self.value).

	        @return: representation of error
	        @rtype: string
	        """
	        return str(self.value)

	class InvenioConnectorServerError(Exception):
	    """
	    Raised by InvenioConnector when it is used on a machine with no
	    Invenio installed and no remote server URL is given during
	    instantiation.
	    """

	    def __init__(self, value):
	        """
	        Store the error description on the exception.

	        @param value: an error string to display to the user.
	        @type value: string
	        """
	        # Forward the message to Exception so that ``args``, ``repr()``
	        # and pickling carry it too (it used to be dropped).
	        Exception.__init__(self, value)
	        self.value = value

	    def __str__(self):
	        """
	        Return the error description (the contents of self.value).

	        @return: representation of error
	        @rtype: string
	        """
	        return str(self.value)

	class InvenioConnector(object):
	    """
	    Creates a connector to a server running Invenio.

	    Requests go over HTTP, except that searches and uploads call the
	    Invenio Python APIs directly when the server URL is one of the URLs
	    of the locally installed Invenio. Query, record and basket results
	    are cached on the instance (no expiration).
	    """

	    def __init__(self, url=None, user="", password="", login_method="Local",
	                 local_import_path="invenio", insecure_login=False):
	        """
	        Initialize a new instance of the server at given URL.

	        If the server happens to be running on the local machine, the
	        access will be done directly using the Python APIs. In that case
	        you can choose from which base path to import the necessary file
	        specifying the local_import_path parameter.

	        @param url: the url to which this instance will be connected.
	            Defaults to CFG_SITE_URL, if available.
	        @type url: string
	        @param user: the optional username for interacting with the Invenio
	            instance in an authenticated way.
	        @type user: string
	        @param password: the corresponding password.
	        @type password: string
	        @param login_method: the name of the login method the Invenio instance
	            is expecting for this user (in case there is more than one).
	        @type login_method: string
	        @param local_import_path: the base path from which the connector should
	            try to load the local connector, if available. Eg "invenio" will
	            lead to "import invenio.dbquery". (Currently not used by this
	            method.)
	        @type local_import_path: string
	        @param insecure_login: when True, allow logging in through a URL
	            that does not start with "https://".
	        @type insecure_login: bool

	        @raise InvenioConnectorAuthError: if no secure URL is given for
	            authentication, if Mechanize is missing, or if the login fails
	        @raise InvenioConnectorServerError: if no URL is given on a machine
	            without Invenio installed
	        """
	        if url is None:
	            if LOCAL_SITE_URLS is None:
	                raise InvenioConnectorServerError("You do not seem to have Invenio installed and no remote URL is given")
	            self.server_url = LOCAL_SITE_URLS[0]  # Default to CFG_SITE_URL
	        else:
	            self.server_url = url
	        # Falsy (None) when Invenio is not installed locally.
	        self.local = LOCAL_SITE_URLS and self.server_url in LOCAL_SITE_URLS
	        self.cached_queries = {}
	        self.cached_records = {}
	        self.cached_baskets = {}
	        self.user = user
	        self.password = password
	        self.login_method = login_method
	        self.browser = None
	        if self.user:
	            if not insecure_login and not self.server_url.startswith('https://'):
	                raise InvenioConnectorAuthError("You have to use a secure URL (HTTPS) to login")
	            if MECHANIZE_AVAILABLE:
	                self._init_browser()
	                self._check_credentials()
	            else:
	                self.user = None
	                raise InvenioConnectorAuthError(
	                    "The Python module Mechanize (and ClientForm"
	                    " if Mechanize version < 0.2.0) must"
	                    " be installed to perform authenticated requests.")

	    def _init_browser(self):
	        """
	        Override this method with the appropriate way to prepare a logged in
	        browser.
	        """
	        self.browser = mechanize.Browser()
	        self.browser.set_handle_robots(False)
	        self.browser.open(self.server_url + "/youraccount/login")
	        self.browser.select_form(nr=0)
	        self.browser['p_un'] = self.user
	        self.browser['p_pw'] = self.password
	        # Set login_method to be writable
	        self.browser.form.find_control('login_method').readonly = False
	        self.browser['login_method'] = self.login_method
	        self.browser.submit()

	    def _check_credentials(self):
	        """
	        Check that the last browser response looks like a logged-in page.

	        @raise InvenioConnectorAuthError: if the response does not contain
	            a 'youraccount/logout' link
	        """
	        if 'youraccount/logout' not in self.browser.response().read():
	            raise InvenioConnectorAuthError("It was not possible to successfully login with the provided credentials")

	    def search(self, read_cache=True, **kwparams):
	        """
	        Returns records corresponding to the given search query.

	        See docstring of invenio.search_engine.perform_request_search()
	        for an overview of available parameters.

	        When no output format ("of") is given, MARCXML ("xm") is requested
	        and the answer is parsed into a list of Record objects. Otherwise
	        the raw answer is returned, except for of="id" where a textual
	        answer is converted into a list of integers.

	        @param read_cache: when False, a remote query is sent again even if
	            its result is already cached.
	        @type read_cache: bool

	        @raise InvenioConnectorAuthError: if authentication fails
	        """
	        parse_results = False
	        of = kwparams.get('of', "")
	        if of == "":
	            parse_results = True
	            of = "xm"
	            kwparams['of'] = of
	        params = urllib.urlencode(kwparams, doseq=1)
	        # Parsed and raw answers to the same query are cached separately.
	        cache_key = params + str(parse_results)

	        # Are we running locally? If so, better directly access the
	        # search engine directly
	        if self.local and of != 't':
	            # See if user tries to search any restricted collection
	            c = kwparams.get('c', "")
	            if c != "":
	                if isinstance(c, list):
	                    colls = c
	                else:
	                    colls = [c]
	                for collection in colls:
	                    if collection_restricted_p(collection):
	                        if self.user:
	                            self._check_credentials()
	                            continue
	                        raise InvenioConnectorAuthError("You are trying to search a restricted collection. Please authenticate yourself.\n")
	            # Ask the search engine for record ids, then format them
	            # ourselves unless ids are what the caller wanted.
	            kwparams['of'] = 'id'
	            results = perform_request_search(**kwparams)
	            if of.lower() != 'id':
	                results = format_records(results, of)
	        else:
	            if read_cache and cache_key in self.cached_queries:
	                return self.cached_queries[cache_key]
	            search_url = self.server_url + "/search?" + params
	            if self.user:
	                results = self.browser.open(search_url)
	            else:
	                results = urllib2.urlopen(search_url)
	            if 'youraccount/login' in results.geturl():
	                # Current user not able to search collection
	                raise InvenioConnectorAuthError("You are trying to search a restricted collection. Please authenticate yourself.\n")

	        if parse_results:
	            # FIXME: we should not try to parse if results is string
	            parsed_records = self._parse_results(results, self.cached_records)
	            self.cached_queries[cache_key] = parsed_records
	            return parsed_records

	        # "results" is either a file-like response (remote) or an already
	        # materialised value (local); handle both.
	        # pylint: disable=E1103
	        try:
	            res = results.read()
	        except AttributeError:
	            res = results
	        # pylint: enable=E1103

	        if of == "id":
	            try:
	                if isinstance(res, str):
	                    # Transform to list
	                    res = [int(recid.strip())
	                           for recid in res.strip("[]").split(",")
	                           if recid.strip() != ""]
	                res.reverse()
	            except (ValueError, AttributeError):
	                res = []
	        self.cached_queries[cache_key] = res
	        return res

	    def search_with_retry(self, sleeptime=3.0, retrycount=3, **params):
	        """
	        This function performs a search given a dictionary of search(..)
	        parameters. It accounts for server timeouts as necessary and
	        will retry some number of times.

	        @param sleeptime: number of seconds to sleep between retries
	        @type sleeptime: float

	        @param retrycount: number of times to retry given search
	        @type retrycount: int

	        @param params: search parameters
	        @type params: **kwds

	        @rtype: list
	        @return: returns records in given format; an empty list when every
	            attempt failed with urllib2.URLError
	        """
	        results = []
	        count = 0
	        while count < retrycount:
	            try:
	                results = self.search(**params)
	                break
	            except urllib2.URLError:
	                sys.stderr.write("Timeout while searching...Retrying\n")
	                time.sleep(sleeptime)
	                count += 1
	        else:
	            # Loop ended without "break": every attempt failed.
	            sys.stderr.write("Aborting search after %d attempts.\n" % (retrycount,))
	        return results

	    def search_similar_records(self, recid):
	        """
	        Returns the records similar to the given one
	        """
	        return self.search(p="recid:" + str(recid), rm="wrd")

	    def search_records_cited_by(self, recid):
	        """
	        Returns records cited by the given one
	        """
	        return self.search(p="recid:" + str(recid), rm="citation")

	    def get_records_from_basket(self, bskid, group_basket=False, read_cache=True):
	        """
	        Returns the records from the (public) basket with given bskid

	        @param bskid: identifier of the basket
	        @param group_basket: when True (authenticated access only), add
	            "&category=G" to the request
	        @param read_cache: when False, fetch the basket again even if it
	            is already cached
	        """
	        if read_cache and bskid in self.cached_baskets:
	            return self.cached_baskets[bskid]

	        if self.user:
	            if group_basket:
	                category = '&category=G'
	            else:
	                category = ''
	            results = self.browser.open(
	                self.server_url
	                + "/yourbaskets/display?of=xm&bskid=" + str(bskid) + category)
	        else:
	            results = urllib2.urlopen(
	                self.server_url
	                + "/yourbaskets/display_public?of=xm&bskid=" + str(bskid))

	        parsed_records = self._parse_results(results, self.cached_records)
	        self.cached_baskets[bskid] = parsed_records
	        return parsed_records

	    def get_record(self, recid, read_cache=True):
	        """
	        Returns the record with given recid

	        A cached record is returned directly; otherwise the result of
	        search(p="recid:<recid>") is returned.
	        NOTE(review): the two paths return different shapes (a single
	        cached record vs. whatever search() returns); kept as is for
	        backward compatibility.

	        @param read_cache: when False, ignore the caches and query the
	            server again
	        """
	        # The cache test used to be "has_key(recid) or not read_cache",
	        # which raised KeyError for an uncached recid with
	        # read_cache=False and served the cache when asked to bypass it.
	        if read_cache and recid in self.cached_records:
	            return self.cached_records[recid]
	        return self.search(read_cache=read_cache, p="recid:" + str(recid))

	    def upload_marcxml(self, marcxml, mode):
	        """
	        Uploads a record to the server

	        Parameters:
	          marcxml - *str* the XML to upload.
	             mode - *str* the mode to use for the upload.
	                    "-i" insert new records
	                    "-r" replace existing records
	                    "-c" correct fields of records
	                    "-a" append fields to records
	                    "-ir" insert record or replace if it exists

	        Returns the result of task_low_level_submission() when running
	        locally, otherwise the response object of the HTTP request.

	        Raises NameError if mode is not one of the values above.
	        """
	        if mode not in ["-i", "-r", "-c", "-a", "-ir"]:
	            raise NameError("Incorrect mode " + str(mode))

	        # Are we running locally? If so, submit directly
	        if self.local:
	            (code, marcxml_filepath) = tempfile.mkstemp(
	                prefix="upload_%s" % time.strftime("%Y%m%d_%H%M%S_",
	                                                   time.localtime()))
	            marcxml_file_d = os.fdopen(code, "w")
	            try:
	                marcxml_file_d.write(marcxml)
	            finally:
	                # Do not leak the descriptor if the write fails.
	                marcxml_file_d.close()
	            return task_low_level_submission("bibupload", "", mode, marcxml_filepath)

	        params = urllib.urlencode({'file': marcxml,
	                                   'mode': mode})
	        ## We don't use self.browser as batchuploader is protected by IP
	        opener = urllib2.build_opener()
	        opener.addheaders = [('User-Agent', CFG_USER_AGENT)]
	        return opener.open(self.server_url + "/batchuploader/robotupload", params)

	    def _parse_results(self, results, cached_records):
	        """
	        Parses the given results (in MARCXML format).

	        The given "cached_records" dictionary is a pool of
	        already existing parsed records (in order to
	        avoid keeping several times the same records in memory)

	        Returns the list of records collected by RecordsHandler.
	        """
	        parser = xml.sax.make_parser()
	        handler = RecordsHandler(cached_records)
	        parser.setContentHandler(handler)
	        parser.parse(results)
	        return handler.records

	class Record(dict):
	    """
	    Represents an Invenio record.

	    The record is a dictionary keyed by tag + indicators (e.g. "245__").
	    Items are addressed with MARC codes that decompose_code() splits into
	    tag, indicators and an optional subfield code, so record["245__a"]
	    collects the "a" subfields of all "245__" fields.
	    """

	    def __init__(self, recid=None, marcxml=None, server_url=None):
	        """
	        @param recid: identifier of the record, if known
	        @param marcxml: MARCXML text of the record; stored as "" when None
	        @param server_url: base URL of the server holding the record
	        """
	        self.recid = recid
	        if marcxml is None:
	            marcxml = ""
	        self.marcxml = marcxml
	        self.server_url = server_url

	    def __setitem__(self, item, value):
	        """
	        Store value under the field addressed by the MARC code "item".

	        With a subfield code, the whole field is replaced by a single
	        datafield holding just that subfield.
	        """
	        tag, ind1, ind2, subcode = decompose_code(item)
	        field_key = tag + ind1 + ind2
	        if subcode is not None:
	            dict.__setitem__(self, field_key, [{subcode: [value]}])
	        else:
	            dict.__setitem__(self, field_key, value)

	    def __getitem__(self, item):
	        """
	        Return the content addressed by the MARC code "item".

	        Without a subfield code, the stored value of the field is returned;
	        with one, a flat list of that subfield's values across all
	        datafields. Raises KeyError when the field is absent.
	        """
	        tag, ind1, ind2, subcode = decompose_code(item)

	        datafields = dict.__getitem__(self, tag + ind1 + ind2)
	        if subcode is None:
	            return datafields

	        subfields = []
	        for datafield in datafields:
	            # Control fields hold plain strings, not subfield dictionaries;
	            # they have no subfields to contribute (this used to raise
	            # AttributeError through str.has_key).
	            if isinstance(datafield, dict) and subcode in datafield:
	                subfields.extend(datafield[subcode])
	        return subfields

	    def __contains__(self, item):
	        """
	        Tell whether "item" is a stored key of the record.

	        The test is made against the raw dictionary keys (tag +
	        indicators, e.g. "245__"); the code is not normalised.
	        """
	        # "self" was missing from this call, so every "in" test raised
	        # TypeError.
	        return dict.__contains__(self, item)

	    def __repr__(self):
	        return "Record(" + dict.__repr__(self) + ")"

	    def __str__(self):
	        return self.marcxml

	    def export(self, of="marcxml"):
	        """
	        Returns the record in chosen format

	        Only MARCXML is available: "of" is currently ignored.
	        """
	        return self.marcxml

	    def url(self):
	        """
	        Returns the URL to this record.
	        Returns None if not known
	        """
	        if self.server_url is None or self.recid is None:
	            return None
	        return self.server_url + "/" + CFG_SITE_RECORD + "/" + str(self.recid)
	# NOTE(review): the nesting of both classes under this "if" was
	# reconstructed from a flattened source -- confirm against the original.
	if MECHANIZE_AVAILABLE:
	    class _SGMLParserFactory(mechanize.DefaultFactory):
	        """
	        Black magic to be able to interact with CERN SSO forms.
	        """

	        def __init__(self, i_want_broken_xhtml_support=False):
	            """
	            Build a factory whose forms are parsed with the XHTML-compatible
	            form parser.

	            @param i_want_broken_xhtml_support: passed as "allow_xhtml" to
	                the response type finder.
	            """
	            # The parser class lives in ClientForm for old mechanize
	            # releases and in mechanize itself afterwards.
	            if OLD_MECHANIZE_VERSION:
	                parser_class = ClientForm.XHTMLCompatibleFormParser
	            else:
	                parser_class = mechanize.XHTMLCompatibleFormParser
	            forms_factory = mechanize.FormsFactory(
	                form_parser_class=parser_class)
	            mechanize.Factory.__init__(
	                self,
	                forms_factory=forms_factory,
	                links_factory=mechanize.LinksFactory(),
	                title_factory=mechanize.TitleFactory(),
	                response_type_finder=mechanize._html.ResponseTypeFinder(
	                    allow_xhtml=i_want_broken_xhtml_support),
	                )

	    class CDSInvenioConnector(InvenioConnector):
	        """
	        This is a specialized InvenioConnector class suitable to connect
	        to the CERN Document Server (CDS), which uses centralized SSO.
	        """

	        def __init__(self, user="", password="", local_import_path="invenio"):
	            """
	            Connect to CFG_CDS_URL, through HTTPS when a user is given.

	            @param user: optional username for authenticated access.
	            @type user: string
	            @param password: the corresponding password.
	            @type password: string
	            @param local_import_path: forwarded to InvenioConnector.
	            @type local_import_path: string
	            """
	            cds_url = CFG_CDS_URL
	            # Only rewrite the scheme: a blanket replace('http', 'https')
	            # also touches any other "http" in the URL and turns an
	            # already secure URL into "httpss://".
	            if user and cds_url.startswith('http://'):
	                cds_url = 'https://' + cds_url[len('http://'):]
	            super(CDSInvenioConnector, self).__init__(cds_url, user, password, local_import_path=local_import_path)

	        def _init_browser(self):
	            """
	            Log in through the CERN SSO form.

	            @note: update this everytime the CERN SSO login form is refactored.
	            """
	            self.browser = mechanize.Browser(factory=_SGMLParserFactory(i_want_broken_xhtml_support=True))
	            self.browser.set_handle_robots(False)
	            self.browser.open(self.server_url)
	            self.browser.follow_link(text_regex="Sign in")
	            self.browser.select_form(nr=0)
	            self.browser.form['ctl00$ctl00$NICEMasterPageBodyContent$SiteContentPlaceholder$txtFormsLogin'] = self.user
	            self.browser.form['ctl00$ctl00$NICEMasterPageBodyContent$SiteContentPlaceholder$txtFormsPassword'] = self.password
	            self.browser.submit()
	            # The page returned after the login holds a further form that
	            # has to be submitted as well.
	            self.browser.select_form(nr=0)
	            self.browser.submit()

	class RecordsHandler(xml.sax.handler.ContentHandler):
	    """
	    MARCXML Parser

	    SAX content handler that turns <record> elements into Record objects.
	    Records are listed in document order in self.records and shared
	    through the "cached_records" pool, keyed by the integer value of
	    their 001 control field.
	    """

	    def __init__(self, records):
	        """
	        Parameters:
	          records - *dict* a dictionary with an already existing cache of records
	        """
	        self.cached_records = records
	        self.records = []
	        self.in_record = False
	        self.in_controlfield = False
	        self.in_datafield = False
	        self.in_subfield = False
	        self.cur_tag = None
	        self.cur_subfield = None
	        self.cur_controlfield = None
	        self.cur_datafield = None
	        self.cur_record = None
	        self.recid = 0
	        # Text of the control field / subfield currently being read.
	        self.buffer = ""
	        self.counts = 0

	    def startElement(self, name, attributes):
	        """
	        Open a record, control field, data field or subfield.
	        """
	        if name == "record":
	            self.cur_record = Record()
	            self.in_record = True

	        elif name == "controlfield":
	            tag = attributes["tag"]
	            self.cur_datafield = ""
	            self.cur_tag = tag
	            self.cur_controlfield = []
	            # Raw dictionary lookup (what has_key() did), bypassing
	            # Record's own item handling.
	            # NOTE(review): Record stores 3-character tags under a
	            # normalised key such as "001__", so looking up the bare tag
	            # does not find an earlier field and the assignment below
	            # replaces it -- confirm this is intended.
	            if not dict.__contains__(self.cur_record, tag):
	                self.cur_record[tag] = self.cur_controlfield
	            self.in_controlfield = True

	        elif name == "datafield":
	            tag = attributes["tag"]
	            self.cur_tag = tag
	            ind1 = attributes["ind1"]
	            if ind1 == " ":
	                ind1 = "_"
	            ind2 = attributes["ind2"]
	            if ind2 == " ":
	                ind2 = "_"
	            field_key = tag + ind1 + ind2
	            # Raw dictionary lookup, as above.
	            if not dict.__contains__(self.cur_record, field_key):
	                self.cur_record[field_key] = []
	            self.cur_datafield = {}
	            self.cur_record[field_key].append(self.cur_datafield)
	            self.in_datafield = True

	        elif name == "subfield":
	            subcode = attributes["code"]
	            # Repeated subfield codes share one list of values.
	            if subcode not in self.cur_datafield:
	                self.cur_subfield = []
	                self.cur_datafield[subcode] = self.cur_subfield
	            else:
	                self.cur_subfield = self.cur_datafield[subcode]
	            self.in_subfield = True

	    def characters(self, data):
	        """
	        Accumulate the text of the current subfield or control field.

	        Text outside of those that carries the
	        "Search-Engine-Total-Number-Of-Results:" marker has its first
	        number stored in self.counts.
	        """
	        if self.in_subfield or self.in_controlfield:
	            self.buffer += data
	        elif "Search-Engine-Total-Number-Of-Results:" in data:
	            # Leftover debugging "print" statements were removed here.
	            match_obj = re.search(r"\d+", data)
	            if match_obj:
	                self.counts = int(match_obj.group())

	    def endElement(self, name):
	        """
	        Close the current element and store the text gathered for it.
	        """
	        if name == "record":
	            self.in_record = False
	        elif name == "controlfield":
	            if self.cur_tag == "001":
	                self.recid = int(self.buffer)
	                if self.recid not in self.cached_records:
	                    # Add record to the global cache
	                    self.cached_records[self.recid] = self.cur_record
	                # Add record to the ordered list of results; for an
	                # already cached recid this is the cached object.
	                self.records.append(self.cached_records[self.recid])

	            self.cur_controlfield.append(self.buffer)
	            self.in_controlfield = False
	            self.buffer = ""
	        elif name == "datafield":
	            self.in_datafield = False
	        elif name == "subfield":
	            self.in_subfield = False
	            self.cur_subfield.append(self.buffer)
	            self.buffer = ""


	def decompose_code(code):
	    """
	    Split a MARC "code" such as "245__a" into (tag, ind1, ind2, subcode).

	    The code is left-justified to six characters first. Blank indicators
	    are returned as "_" and a blank subfield code as None; characters
	    past the sixth are ignored.
	    """
	    padded = "%-6s" % code
	    tag = padded[:3]
	    indicators = []
	    for position in (3, 4):
	        char = padded[position]
	        if char == " ":
	            char = "_"
	        indicators.append(char)
	    subcode = padded[5]
	    if subcode == " ":
	        subcode = None
	    return (tag, indicators[0], indicators[1], subcode)

invenio_connector.pyNo OneTemporaryActions

File Metadata

invenio_connector.pyView Options

Event Timeline

invenio_connector.py
No OneTemporary
Actions

invenio_connector.py
View Options