bibauthorid_utils.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, Oct 30, 04:59

bibauthorid_utils.py
View Options

	# -*- coding: utf-8 -*-
	##
	## This file is part of Invenio.
	## Copyright (C) 2011 CERN.
	##
	## Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
	'''
	bibauthorid_utils
	Bibauthorid utilities used by many parts of the framework
	'''

	import sys
	import re

	import bibauthorid_config as bconfig
	import bibauthorid_structs as dat

	try:
	from invenio.search_engine import get_record
	except ImportError:
	pass


	def string_partition(s, sep, direc='l'):
	    '''
	    Partition a string around one occurrence of the separator.
	    Mimics the string.partition function, which is not available in Python2.4

	    Unlike str.rpartition, a separator that is not found always yields
	    (s, '', ''), whatever the search direction.

	    @param s: string to be partitioned
	    @type s: string
	    @param sep: separator to partition by; may span several characters
	    @type sep: string
	    @param direc: 'r' searches for the last occurrence of the separator,
	        any other value searches for the first one
	    @type direc: string

	    @return: tuple of (left of sep, sep, right of sep)
	    @rtype: tuple
	    '''
	    if direc == 'r':
	        i = s.rfind(sep)
	    else:
	        i = s.find(sep)

	    if i < 0:
	        return (s, '', '')

	    # Slice by the real separator length: a fixed width of one character
	    # would cut a multi-character separator in half and leak its tail
	    # into the right-hand part.
	    end = i + len(sep)
	    return (s[:i], s[i:end], s[end:])


	def split_name_parts(name_string, delete_name_additions=True,
	                     override_surname_sep=''):
	    '''
	    Splits name_string into its surname, initials and given names.

	    The surname is everything before the first surname separator found in
	    the name. Without any separator, the last blank-separated token is
	    taken as surname; a name made of a single token is returned as
	    surname only. Anything behind a further comma in the remaining part
	    (e.g. "Jr.") is dropped.

	    @param name_string: the name to be split
	    @type name_string: string
	    @param delete_name_additions: if true, parenthesised additions such as
	        (Ed.) or (spokesperson) are removed before splitting
	    @type delete_name_additions: boolean
	    @param override_surname_sep: alternative surname separator(s) used
	        instead of bconfig.SURNAMES_SEPARATOR_CHARACTER_LIST; every
	        element is tried in turn
	    @type override_surname_sep: string

	    @return: list of [surname string, initials list, names list,
	        positions list]; positions holds, for every entry of the names
	        list, the index of its initial in the initials list.
	        e.g. (assuming ',' is a configured surname separator and '.'
	        a configured name separator)
	        split_name_parts("Ellis, John R.")
	        --> ['Ellis', ['J', 'R'], ['John'], [0]]
	        split_name_parts("Ellis, K. John Rob")
	        --> ['Ellis', ['K', 'J', 'R'], ['John', 'Rob'], [1, 2]]
	    @rtype: list of lists
	    '''
	    # Honour the override; it used to be replaced by a hard-coded ','.
	    if override_surname_sep:
	        surname_separators = override_surname_sep
	    else:
	        surname_separators = bconfig.SURNAMES_SEPARATOR_CHARACTER_LIST

	    name_separators = bconfig.NAMES_SEPARATOR_CHARACTER_LIST

	    # "-1" in the configuration selects the built-in separator set.
	    if name_separators == "-1":
	        name_separators = r',;.=\-\(\)'

	    if delete_name_additions:
	        # Any parenthesised group counts as a name addition.
	        for name_addition in re.findall(r'\([^)]*\)', name_string):
	            name_string = name_string.replace(name_addition, '')

	    surname = ""
	    rest_of_name = ""
	    found_sep = False
	    name_string = name_string.strip()

	    for sep in surname_separators:
	        if sep in name_string:
	            found_sep = True
	            surname, rest_of_name = string_partition(name_string, sep)[0::2]
	            break

	    if not found_sep:
	        if " " not in name_string:
	            # Single token: surname only. Same four-element shape as the
	            # regular return value so callers can index it uniformly.
	            return [name_string, [], [], []]
	        # "John Ellis": the last token is the surname.
	        rest_of_name, surname = string_partition(name_string, ' ',
	                                                 direc='r')[0::2]

	    # Keep only what precedes a further comma ("John R., Jr." -> "John R.").
	    if "," in rest_of_name:
	        rest_of_name = string_partition(rest_of_name, ",")[0]

	    substitution_regexp = re.compile('[%s]' % (name_separators))
	    initials_names_list = substitution_regexp.sub(' ', rest_of_name).split()
	    names = []
	    initials = []
	    positions = []

	    for position, token in enumerate(initials_names_list):
	        if len(token) > 1:
	            # A full given name: remember which initial it belongs to.
	            names.append(token.capitalize())
	            positions.append(position)
	        initials.append(token[0].capitalize())

	    return [surname, initials, names, positions]


	def split_name_parts_old(name_string, delete_name_additions=True):
	    '''
	    Splits name_string into surname, initials (without trailing dot)
	    and given names, using the comma as the only surname separator.

	    Anything behind a second comma (e.g. "Jr.") is dropped. Without
	    any comma the whole string is returned as surname.

	    @param name_string: the name to be split
	    @type name_string: string
	    @param delete_name_additions: if true, parenthesised additions such as
	        (Ed.) or (spokesperson) are removed before splitting
	    @type delete_name_additions: boolean

	    @return: list of [surname string, initials list, names list]
	        e.g. (assuming '.' is a configured name separator)
	        split_name_parts_old("Ellis, John R.")
	        --> ['Ellis', ['J', 'R'], ['John']]
	    @rtype: list of lists

	    '''
	    name_separators = bconfig.NAMES_SEPARATOR_CHARACTER_LIST

	    # "-1" in the configuration selects the built-in separator set.
	    if name_separators == "-1":
	        name_separators = r'.,;=\-\(\)'

	    if delete_name_additions:
	        # Any parenthesised group counts as a name addition.
	        for name_addition in re.findall(r'\([^)]*\)', name_string):
	            name_string = name_string.replace(name_addition, '')

	    surname, rest_of_name = string_partition(name_string, ',')[0::2]

	    # Keep only what precedes a further comma ("John R., Jr." -> "John R.").
	    if "," in rest_of_name:
	        rest_of_name = string_partition(rest_of_name, ",")[0]

	    substitution_regexp = re.compile('[%s]' % (name_separators))
	    initials_names_list = substitution_regexp.sub(' ', rest_of_name).split()
	    names = []
	    initials = []

	    for token in initials_names_list:
	        if len(token) > 1:
	            names.append(token.capitalize())
	        initials.append(token[0].capitalize())

	    return [surname, initials, names]


	def create_canonical_name(name):
	    '''
	    Creates a dot-separated canonical form of a name.

	    The name is unified with the initials first (see
	    create_unified_name), then every run of characters other than
	    ASCII letters and digits becomes a single dot; e.g. the unified
	    name "J. R. Ellis" turns into "J.R.Ellis".

	    @param name: the name to create the canonical form of
	    @type name: string

	    @return: the canonical name
	    @rtype: string
	    '''
	    canonical_name = create_unified_name(name, reverse=True)
	    # Collapse each run of artifacts in one pass, whatever its length, so
	    # long runs cannot leave consecutive dots in the result.
	    # NOTE(review): non-ASCII letters are treated as artifacts as well.
	    canonical_name = re.sub("[^a-zA-Z0-9]+", " ", canonical_name)
	    return canonical_name.strip().replace(" ", ".")

	def create_normalized_name(splitted_name):
	    '''
	    Creates a normalized name from a given name array. A normalized name
	    looks like "Lastname, Firstnames and Initials": every initial that has
	    a full given name is written out, the others are abbreviated ("R.").

	    @param splitted_name: name array from split_name_parts, in the form
	        [surname, initials, names, positions]; an array without the
	        positions list is accepted and yields abbreviated initials only
	    @type splitted_name: list in form [string, list, list, list]

	    @return: normalized name; "Lastname," if there are neither initials
	        nor names
	    @rtype: string
	    '''
	    surname = splitted_name[0]
	    initials = splitted_name[1]
	    names = splitted_name[2]

	    positions = []
	    if len(splitted_name) > 3:
	        positions = splitted_name[3]

	    name = surname + ','

	    if not initials and not names:
	        return name

	    # Walk the initials by position: looking an initial up by value would
	    # always hit its first occurrence and mix up names such as "John J.".
	    for index, initial in enumerate(initials):
	        try:
	            part = names[positions.index(index)]
	        except (ValueError, IndexError):
	            # No full name recorded for this position.
	            part = initial + '.'
	        name = name + ' ' + part

	    return name


	def create_unified_name(name, reverse=False):
	    '''
	    Creates unified name: the surname plus the dotted initials of all
	    given names. E.g. Ellis, John Richard T. (Jr.)
	    will become "Ellis, J. R. T. " (every initial is followed by a dot
	    and a blank, so the non-reversed form ends with a blank).

	    Names containing "ollabo" (collaborations) additionally get
	    "ollaborations" reduced to "ollaboration" and the words
	    "The ", "the ", "For " and "for " removed.

	    @param name: The name to be unified
	    @type name: string

	    @param reverse: if true, the initials come first, followed by the
	        surname, and no comma is inserted
	    @type reverse: boolean

	    @return: The unified name
	    @rtype: string

	    '''
	    parts = split_name_parts(name)
	    surname = parts[0]
	    dotted_initials = "".join(["%s. " % (initial) for initial in parts[1]])

	    if reverse:
	        unified_name = dotted_initials + "%s" % (surname)
	    else:
	        unified_name = "%s, " % (surname) + dotted_initials

	    if "ollabo" in unified_name:
	        clean_ups = (("ollaborations", "ollaboration"),
	                     ("The ", ""),
	                     ("the ", ""),
	                     ("For ", ""),
	                     ("for ", ""))
	        for old, new in clean_ups:
	            unified_name = unified_name.replace(old, new)

	    return unified_name

	# pylint: disable=R0912
	# pylint: disable=R0913


	def get_field_values_on_condition(bibrecid, get_table="", get_tag="",
	                                  condition_tag="", condition_value="",
	                                  condition="==", source="MEM"):
	    '''
	    Method to fetch data from a record in the database.
	    It is possible to specify a condition in order to get
	    only certain fields if condition holds.

	    Examples:

	    In [2]: bibauthorid_utils.get_field_values_on_condition
	    (742535, [100, 700], 'u', 'a', 'Mathieu, Vincent')
	    Out[2]: set(['UMH, Mons'])

	    In [3]: bibauthorid_utils.get_field_values_on_condition
	    (742535, [100, 700], 'u', 'a')
	    Out[3]: set(['LPSC, Grenoble', 'UMH, Mons'])

	    In [9]: bibauthorid_utils.get_field_values_on_condition
	    (742535, [100,700], 'a', 'u', 'UMH, Mons')
	    Out[9]: set(['Semay, Claude', 'Mathieu, Vincent'])

	    In [4]: bibauthorid_utils.get_field_values_on_condition
	    (742535, [100, 700], 'u')
	    Out[4]: set(['LPSC, Grenoble', 'UMH, Mons'])

	    In [5]: bibauthorid_utils.get_field_values_on_condition
	    (742535, [100, 700])
	    Out[5]:
	    {'100': [([('a', 'Mathieu, Vincent'), ('u', 'UMH, Mons'), ('i', '4286')],
	              ' ',
	              ' ',
	              '',
	              3)],
	     '700': [([('a', 'Semay, Claude'), ('u', 'UMH, Mons'), ('i', '4286')],
	              ' ',
	              ' ',
	              '',
	              4),
	             ([('a', 'Silvestre-Brac, Bernard'),
	               ('u', 'LPSC, Grenoble'),
	               ('i', '2240')],
	              ' ',
	              ' ',
	              '',
	              5)]}

	    In [6]: bibauthorid_utils.get_field_values_on_condition(1)
	    Out[6]:
	    {'001': [([], ' ', ' ', '1', 1)],
	     '035': [([('a', 'Groom:0965xu'), ('9', 'SPIRESTeX')], ' ', ' ', '', 13)],
	     '037': [([('a', 'CALT-68-62')], ' ', ' ', '', 3)],
	     '100': [([('a', 'Groom, Donald E.'), ('u', 'Caltech'), ('i', '981')],
	              ' ',
	              ' ',
	              '',
	              4)],
	     ...
	     '980': [([('a', 'Citeable')], ' ', ' ', '', 14),
	             ([('a', 'CORE')], ' ', ' ', '', 15)]}

	    @param bibrecid: The id of the record (bibrec) to get
	    @type bibrecid: int
	    @param get_table: List of one or more tables to look at
	    @type get_table: list or string or unicode or int or long
	    @param get_tag: The value of this tag shall be returned
	    @type get_tag: string
	    @param condition_tag: First part of the condition. Provide a tag to look up
	    @type condition_tag: string
	    @param condition_value: Second part of the condition. Provide a value
	        that has to be matched
	    @type condition_value: string
	    @param condition: Optional value to describe the condition.
	        Defaults to "==" and may be any comparison operator. It is
	        evaluated as Python code and must come from trusted code only.
	    @type condition: string
	    @param source: "MEM" reads the record from dat.RELEVANT_RECORDS,
	        "API" fetches it with get_record; with any other value no
	        record is loaded and an empty set is returned
	    @type source: string

	    @return: set of found values, empty set if no value found or the
	        record is not available. Special cases: the whole record if
	        get_table is empty; the stored value of the "cites" or
	        "cited-by" table if one of them is requested and present; a
	        dictionary of the requested tables if neither get_tag nor a
	        complete condition (tag and value) is given.
	    @rtype: set or dictionary

	    '''
	    rec = None
	    if source == "MEM":
	        rec = dat.RELEVANT_RECORDS.get(bibrecid)
	    elif source == "API":
	        rec = get_record(bibrecid)

	    # Field values are decoded before they are compared, so the reference
	    # value has to be decoded as well.
	    if condition_value and isinstance(condition_value, str):
	        condition_value = condition_value.decode('utf-8')

	    if not rec:
	        return set()

	    if not get_table:
	        return rec

	    if not isinstance(get_table, list):
	        # type(u'') is the unicode string type. A unicode table name must
	        # be wrapped like a byte string; otherwise it would be reported
	        # as an error and then iterated character by character.
	        if isinstance(get_table, str) or isinstance(get_table, type(u'')):
	            get_table = [get_table]
	        elif isinstance(get_table, int) or isinstance(get_table, long):
	            get_table = [str(get_table)]
	        else:
	            # Only reported; the value is still iterated below.
	            sys.stderr.write('Error: Wrong table for table selection. ' +
	                             'Allowed are list of strings, string or int/long values\n')

	    returnset = set()

	    for table in get_table:
	        if str(table) not in rec:
	            continue

	        if table in ["cites", "cited-by"]:
	            return rec[str(table)]

	        for recordentries in rec[str(table)]:
	            is_skip_entry = False

	            # recordentries[0] holds the (tag, value) pairs of the entry.
	            for field in recordentries[0]:
	                if condition_tag and condition_value:
	                    if field[0] == condition_tag:
	                        # NOTE(review): `condition` is interpolated into
	                        # an eval() expression. It must never be fed from
	                        # untrusted input. The expression refers to the
	                        # local names `field` and `condition_value`, so
	                        # these must not be renamed.
	                        try:
	                            condition_fails = not eval(("field[1].decode('utf-8') %s"
	                                                        + " condition_value") % (condition))
	                        except (TypeError, NameError, IndexError):
	                            condition_fails = False

	                        if condition_fails:
	                            # One failing condition field rejects the
	                            # whole entry.
	                            is_skip_entry = True
	                            break
	                elif get_tag:
	                    if get_tag == field[0]:
	                        returnset.add(field[1].decode('utf-8'))
	                else:
	                    # Neither a tag nor a complete condition: hand back
	                    # the requested tables as stored in the record.
	                    retlist = {}

	                    for wanted in get_table:
	                        try:
	                            retlist[str(wanted)] = rec[str(wanted)]
	                        except KeyError:
	                            pass

	                    return retlist

	            if not is_skip_entry:
	                for field in recordentries[0]:
	                    if field[0] == get_tag:
	                        returnset.add(field[1].decode('utf-8'))

	    return returnset


	# pylint: enable=R0912
	# pylint: enable=R0913


	def str_to_unicode(obj, encoding="utf-8"):
	    '''
	    Transforms a byte string object into a unicode object.
	    Unicode objects and objects that are no strings at all are
	    returned unchanged.

	    @param obj: the object to be transformed
	    @type obj: string or unicode
	    @param encoding: the encoding used to decode byte strings.
	        Defaults to UTF-8
	    @type encoding: string

	    @return: returns the unicode representation of the object.
	    @rtype: basetype::unicode

	    '''
	    needs_decoding = (isinstance(obj, basestring)
	                      and not isinstance(obj, unicode))

	    if not needs_decoding:
	        return obj

	    return unicode(obj, encoding)


	def str_to_int(string_value):
	    '''
	    Transforms a string into an int value. Only the digit characters of
	    the string are used; everything else (signs included) is ignored.

	    @param string_value: The string representation of an integer
	    @type string_value: string

	    @return: The int value built from the digits of the string
	    @rtype: int

	    '''
	    digits = []

	    for char in string_value:
	        if char.isdigit():
	            digits.append(char)

	    return int(''.join(digits))


	def clean_name_string(namestring, replacement=" ", keep_whitespace=True,
	                      trim_whitespaces=False):
	    '''
	    Remove specific artifacts from the names in order to be able to
	    compare them. E.g. 't Hooft, G. and t'Hooft, G. both become
	    "t Hooft, G."

	    Every character that is not an ASCII letter, a digit, a comma or a
	    dot is replaced. Afterwards runs of whitespace are collapsed into a
	    single blank and the result is stripped.

	    @param namestring: the string to be cleaned
	    @type namestring: string
	    @param replacement: the string every artifact is replaced with
	    @type replacement: string
	    @param keep_whitespace: if true, whitespace is kept; otherwise it is
	        treated as an artifact and replaced, too
	    @type keep_whitespace: boolean
	    @param trim_whitespaces: if true, leading and trailing whitespace is
	        removed before artifacts are replaced (this matters when
	        keep_whitespace is false)
	    @type trim_whitespaces: boolean

	    @return: the cleaned name string
	    @rtype: string
	    '''
	    if trim_whitespaces:
	        # strip() returns a new string; the result has to be kept.
	        namestring = namestring.strip()

	    if keep_whitespace:
	        artifact_removal = re.compile(r"[^a-zA-Z0-9,.\s]")
	    else:
	        artifact_removal = re.compile(r"[^a-zA-Z0-9,.]")

	    # No upper bound: a run of any length collapses into one blank.
	    whitespace_removal = re.compile(r"\s{2,}")
	    tmp = artifact_removal.sub(replacement, namestring)

	    return whitespace_removal.sub(" ", tmp).strip()

bibauthorid_utils.pyNo OneTemporaryActions

File Metadata

bibauthorid_utils.pyView Options

Event Timeline

bibauthorid_utils.py
No OneTemporary
Actions

bibauthorid_utils.py
View Options