sherpa_romeo.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Mon, Oct 14, 09:29

sherpa_romeo.py
View Options

	# -- coding: utf-8 --
	#
	# This file is part of Invenio.
	# Copyright (C) 2012, 2013 CERN.
	#
	# Invenio is free software; you can redistribute it and/or
	# modify it under the terms of the GNU General Public License as
	# published by the Free Software Foundation; either version 2 of the
	# License, or (at your option) any later version.
	#
	# Invenio is distributed in the hope that it will be useful, but
	# WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with Invenio; if not, write to the Free Software Foundation, Inc.,
	# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	from invenio.utils.xmlDict import XmlDictConfig, ElementTree
	import urllib2
	from werkzeug.contrib.cache import RedisCache
	from invenio.ext.cache import cache


	class SherpaRomeoSearch(object):
	"""
	SHERPA/RoMEO API wrapper class to search Publishers and Journals
	Queries the SHERPA/RoMEO API and parses the xml returned.

	The search functions return the associated names.
	(search_issn returns journal)

	It uses Redis to cache all xml from queries and create a small Journal
	and Publisher db.

	For more detailed results the SherpaRomeoSearch.parser API must be used
	that gives access to conditions, issn and single items on exact matches


	@seealso: http://www.sherpa.ac.uk/romeo/api.html
	"""

	def __init__(self):
	self.parser = SherpaRomeoXMLParser()
	self.error = False
	self.error_message = ""

	def search_publisher(self, query):
	"""
	Search for Publishers
	query: the query to be made

	returns a list with publisher names
	"""

	# Search first for exact matches in cache
	cached_publisher = cache.get("publisher:" + query.lower())
	if cached_publisher is not None:
	self.parser.set_single_item(publisher=cached_publisher,)
	return cached_publisher['name']

	cleanquery = query.replace(" ", "+")
	url = "http://www.sherpa.ac.uk/romeo/api29.php?pub=" + cleanquery
	self.parser.parse_url(url)
	self.error = self.parser.error
	self.error_message = self.parser.error_message
	if not self.error:
	return self.parser.get_publishers(attribute='name')

	def search_journal(self, query, query_type='contains'):
	"""
	Search for Journals
	query: the query to be made
	query_type: it must be 'contains'(default), 'exact' or 'start'

	returns a list with the specific journal titles or empty list
	"""

	if query_type is 'exact':
	# Search first for exact matches in cache
	cached_journal = cache.get("journal:" + query.lower())
	if cached_journal is not None:
	self.parser.set_single_item(journal=cached_journal)
	return cached_journal['jtitle']

	cleanquery = query.replace(" ", "+")
	url = "http://www.sherpa.ac.uk/romeo/api29.php?jtitle=" + cleanquery + "&qtype=" + query_type
	self.parser.parse_url(url)
	self.error = self.parser.error
	self.error_message = self.parser.error_message
	if not self.error:
	return self.parser.get_journals(attribute='jtitle')

	def search_issn(self, issn):
	""" Search for Journals based on ISSN """

	url = "http://www.sherpa.ac.uk/romeo/api29.php?issn=" + issn
	self.parser.parse_url(url)
	self.error = self.parser.error
	self.error_message = self.parser.error_message
	if not self.error:
	return self.parser.get_journals()

	def get_num_hits(self):
	return int(self.parser.xml['header']['numhits'])


	class SherpaRomeoXMLParser(object):

	def __init__(self):
	self.parsed = False
	self.single_item = False
	self.error = False
	self.error_message = ""

	def parse_url(self, url):
	self.url = url
	#example
	#url = 'http://www.sherpa.ac.uk/romeo/api29.php?jtitle=Annals%20of%20Physics'

	found_journal = url.find("jtitle=")
	found_publisher = url.find("pub=")
	if found_journal != -1:
	self.search_type = "journal_search"
	self.query = url[found_journal + 7:(len(url) - 15)]
	elif found_publisher != -1:
	self.search_type = "publisher_search"
	found_publisher = url.find("pub=")
	self.query = url[found_publisher + 4:len(url)]
	else:
	self.search_type = "issn_search"
	found_publisher = url.find("issn=")
	self.query = url[found_publisher + 4:len(url)]

	cached_xml = cache.get(self.search_type + ":" + self.query.lower())
	if cached_xml is None:
	try:
	self.data = urllib2.urlopen(url).read()
	except urllib2.HTTPError:
	self.error = True
	return
	try:
	root = ElementTree.XML(self.data)
	except SyntaxError:
	self.error = True
	return
	self.xml = XmlDictConfig(root)
	outcome = self.xml['header']['outcome']
	if outcome != 'failed' and outcome != 'notFound':
	cache.set(self.search_type + ":" + self.query.lower(), self.xml,
	999999999999)
	else:
	self.xml = cached_xml
	#self.data = cached_xml
	#root = ElementTree.XML(self.data)
	#self.xml = XmlDictConfig(root)

	if self.xml['header']['outcome'] == 'failed':
	self.error = True
	self.error_message = self.xml['header']['message']
	self.parsed = True
	self._cache_parsed_xml()

	def _cache_parsed_xml(self):
	""" Caches every Journal and Publisher found in the xml """
	if not self.parsed:
	return

	outcome = self.xml['header']['outcome'] is 'notFound'
	if outcome is 'notFound' or outcome is 'failed':
	return

	if self.xml['header']['outcome'] == 'singleJournal' \
	or self.xml['header']['outcome'] == 'uniqueZetoc':
	journal = self.xml['journals']['journal']
	cache.set("journal:" + journal['jtitle'].lower(), journal, 999999999999)

	if self.xml['header']['outcome'] != 'uniqueZetoc':
	# if the publisher has been indexed by RoMEO
	publisher = self.xml['publishers']['publisher']

	# Associate a Journal with a Publisher key in cache
	cache.set("journal-publisher:" + journal['jtitle'].lower(),
	"publisher:" + publisher['name'].lower(), 999999999999)
	elif self.xml['journals'] is not None:
	for journal in self.xml['journals']['journal']:
	cache.set("journal:" + journal['jtitle'].lower(), journal, 999999999999)

	if self.xml['header']['numhits'] == '1' \
	and self.xml['header']['outcome'] != 'uniqueZetoc':
	publisher = self.xml['publishers']['publisher']
	cache.set("publisher:" + publisher['name'].lower(), publisher, 999999999999)
	elif self.xml['publishers'] is not None:
	for publisher in self.xml['publishers']['publisher']:
	cache.set("publisher:" + publisher['name'].lower(), publisher,
	None)

	def set_single_item(self, journal=None, publisher=None):
	"""
	Used to initialize the parser with items retrieved from cache

	Note: if both a journal and a publisher are defined
	the publisher is associated with the journal
	"""
	self.xml = dict()
	self.xml['header'] = dict()
	self.xml['header']['outcome'] = ''
	if journal is not None:
	self.xml['journals'] = dict()
	self.xml['journals']['journal'] = journal
	self.xml['header']['numhits'] = '1'
	self.parsed = True
	self.single_item = True
	if publisher is not None:
	# Associate a Journal with a Publisher key in cache
	self.xml['header']['outcome'] = 'singleJournal'
	cache.set("journal-publisher:" + journal['jtitle'].lower(),
	"publisher:" + publisher['name'].lower(), 999999999999)
	elif publisher is not None:
	self.xml['header']['outcome'] = 'publisherFound'
	self.xml['header']['numhits'] = '1'
	self.xml['publishers'] = dict()
	self.xml['publishers']['publisher'] = publisher
	self.single_item = True
	self.parsed = True

	def get_single_item(self):
	"""Returns a single item retrieved from cache."""
	if self.single_item:
	return self.xml
	else:
	return None

	def get_journals(self, attribute=None):
	"""Returns a list of journals.

	If an attribute is defined, returns only this attribute from
	every journal.
	"""
	if self.xml['header']['outcome'] == 'notFound' \
	or self.xml['header']['outcome'] == 'failed':
	return []

	if self.xml['header']['outcome'] == 'singleJournal' \
	or self.xml['header']['outcome'] == 'uniqueZetoc' \
	or (self.single_item and self.xml['journals']['journal'] is not None):
	if attribute is None:
	return [self.xml['journals']['journal']]
	elif self.xml['journals']['journal'][attribute] is not None:
	return [self.xml['journals']['journal'][attribute]]
	else:
	return []

	journals = list()
	for j in self.xml['journals']['journal']:
	if attribute is None:
	journals.append(j)
	else:
	journals.append(j[attribute])

	return journals

	def get_publishers(self, attribute=None, journal=None,):
	""" Returns a list of the publishers if a publisher search was made or
	an empty list.

	If a journal is defined, it returns the associated publisher
	for this Journal or None. A journal definition makes the functions
	to query again if the publisher isn't found in the xml or cache.
	Note: If you define a journal, you must have searched for it first.

	If an attribute is defined, returns only this attribute from
	every publisher
	"""

	if self.xml['header']['outcome'] == 'notFound' \
	or self.xml['header']['outcome'] == 'failed':
	return None

	if self.xml['header']['outcome'] == 'singleJournal':
	return self.xml['publishers']['publisher']

	if self.xml['header']['outcome'] == 'uniqueZetoc':
	# the Publisher has not yet been indexed by RoMEO
	return None

	if journal is not None:
	# search the cache for matches
	publisher_key = cache.get("journal-publisher:" + journal.lower())
	if publisher_key is not None:
	return cache.get(publisher_key)

	# Query again sherpa romeo db to get the publisher
	s = SherpaRomeoSearch()
	issn = self.get_journals(attribute='issn')[0]
	if issn is not None:
	s.search_issn(issn)
	return s.parser.get_publishers()
	else:
	return None

	publishers = list()
	if self.xml['header']['outcome'] == 'publisherFound':
	if self.xml['header']['numhits'] == '1':
	p = self.xml['publishers']['publisher']
	if attribute is None:
	publishers.append(p)
	else:
	if p[attribute] is None:
	return []
	publishers.append(p[attribute])
	else:
	for p in self.xml['publishers']['publisher']:
	if attribute is None:
	publishers.append(p)
	else:
	publishers.append(p[attribute])
	return publishers

	def get_issn(self):
	"""Returns the issn if the search returns a single Journal."""

	if 'issn' in self.xml:
	return self.xml['issn']

	if self.xml['header']['outcome'] == 'notFound' \
	or self.xml['header']['outcome'] == 'failed':
	return None

	if self.xml['header']['outcome'] == 'singleJournal'\
	or self.xml['header']['outcome'] == 'uniqueZetoc':
	return self.xml['journals']['journal']['issn']
	else:
	return None
	issns = dict()
	for j in self.xml['journals']['journal']:
	issns[j['jtitle']] = j['issn']

	return issns

sherpa_romeo.pyNo OneTemporaryActions

File Metadata

sherpa_romeo.pyView Options

Event Timeline

sherpa_romeo.py
No OneTemporary
Actions

sherpa_romeo.py
View Options