bibauthorid_authorname_utils.py

# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
'''
bibauthorid_authornames_utils
Helper for accessing the author names data structure
'''
import bibauthorid_utils
from copy import deepcopy
from bibauthorid_utils import clean_name_string
from bibauthorid_utils import split_name_parts
import bibauthorid_structs as dat
import bibauthorid_config as bconfig
def get_bibrefs_by_authornames_id(authornames_id):
'''
Finds actual ids of the author name as it appears in bib10x or bib70x
@param authornames_id: id in aidAUTHORNAMES
@return: A list of sets.
- The first set in the list contains all ids in bib10x
- The second set in the list contains all ids in bib70x
@rtype: list of sets
'''
    bibrefs = []
    bibref_str = [row['bibrefs'] for row in dat.AUTHOR_NAMES
                  if row['id'] == authornames_id]
    if len(bibref_str) > 0:
        # the row stores a comma-separated list, e.g. "100:123,700:456"
        bibrefs = bibref_str[0].split(",")
b100 = set()
b700 = set()
for bibref in bibrefs:
tag, refid = bibref.split(':')
if tag == "100":
b100.add(int(refid))
elif tag == "700":
b700.add(int(refid))
else:
bconfig.LOGGER.error("Wrong bibref Tag...how did you do that?")
return [b100, b700]
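# Illustrative usage sketch (not part of the original module): assuming the
# aidAUTHORNAMES cache holds a row with id 42 (made up) whose 'bibrefs' field
# reads "100:123,700:456", the call buckets the ids by tag:
#
#   >>> get_bibrefs_by_authornames_id(42)
#   [set([123]), set([456])]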
def name_matching(orig_name, target_name):
"""
Checks the compatibility of the given names.
@param orig_name: The original name String
@type orig_name: string
@param target_name: The target name string
@type target_name: string
    @return: True if the given names are compatible, False otherwise
@rtype: boolean
"""
orig = bibauthorid_utils.split_name_parts(orig_name)
targ = bibauthorid_utils.split_name_parts(target_name)
if (len(orig[1]) == 0) or (len(targ[1]) == 0):
return True
else:
initials_set = set(orig[1])
names_set = set(orig[2])
comp_initials_set = set(targ[1])
comp_names_set = set(targ[2])
names_intersection = names_set.intersection(comp_names_set)
initials_intersection = initials_set.intersection(comp_initials_set)
if len(initials_intersection) == 0:
if len(names_intersection) != 0:
bconfig.LOGGER.error("length of names intersection != 0..."
"This should never happen!")
if ((len(names_intersection) == 0) and (len(comp_names_set) > 0)
and (len(names_set) > 0)):
return False
if orig[1][0] == targ[1][0]:
return True
return False
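# Illustrative sketch of the compatibility rules above (the names are made up
# and split_name_parts is assumed to behave as documented further below):
#
#   >>> name_matching('Ellis, John R.', 'Ellis, J.')
#   True      # the leading initials agree
#   >>> name_matching('Ellis, John', 'Ellis, Keith')
#   False     # both sides carry full first names, but none of them intersect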
def search_matching_names(authorname_string, match_function=name_matching,
consider_surname_only=True):
"""
    Search for matching names given a matching function.
@warning: searching for matching name with consider_surname_only=false
will be painfully slow! You've been warned.
    @warning: for sanity's sake, surnames not followed by a comma are ignored;
        if you search for a surname without a comma or first names, a comma is
        appended to the end of the string automatically.
@param authorname_string: The author name string
@type authorname_string: string
@param match_function: The function to use for the name matching
@type match_function: function descriptor
@param consider_surname_only: Decides if only names with the same
surname shall be considered or _all_ other names.
@type consider_surname_only: boolean
    @return: a list of matches; each match pairs an (id, name) tuple with the
        split name parts of that name
@rtype: list of tuples
@note: example:
search_matching_names('einstein, albert')
Out[7]: [[(962L, 'Einstein, Albert'), ['Einstein', ['A'], ['Albert']]],
[(1128L, 'Einstein, A.'), ['Einstein', ['A'], []]]]
"""
possible_names = []
names = []
if authorname_string.count(',') == 0:
authorname_string += ','
authorname = bibauthorid_utils.split_name_parts(authorname_string)
if consider_surname_only:
names = [row for row in dat.AUTHOR_NAMES
if row['name'].startswith(authorname[0])]
else:
names = [row for row in dat.AUTHOR_NAMES]
for name in names:
if match_function(authorname_string, name['name']):
possible_names.append([(name['id'], name['name']),
bibauthorid_utils.split_name_parts(name['name'])])
return possible_names
def get_name_id(name_string):
"""
@return: the id associated to a given string in the authornames table.
Returns -1 if the string is not found.
    @rtype: int
"""
name_id = -1
name = [row['id'] for row in dat.AUTHOR_NAMES
if row['name'] == name_string]
try:
name_id = name[0]
except (IndexError, ValueError):
name_id = -1
return name_id
def get_name_string(authorname_id):
'''
Get name representation for an ID in authornames table
@return: the name string associated with a particular authorid in the
authornames table.
If the ID is not found returns an empty string.
@rtype: string
'''
name_string = ""
name = [row['name'] for row in dat.AUTHOR_NAMES
if row['id'] == authorname_id]
if len(name) > 0:
name_string = name[0]
return name_string
def get_db_name_string(authorname_id):
'''
    Get the DB name representation for an ID in the authornames table
    @return: the db_name string associated with a particular authorname ID in
        the authornames table.
If the ID is not found returns an empty string.
@rtype: string
'''
name_string = ""
name = [row['db_name'] for row in dat.AUTHOR_NAMES
if row['id'] == authorname_id]
if len(name) > 0:
name_string = name[0]
return name_string
def get_name_and_db_name_strings(authorname_id):
'''
Get name representation for an ID in authornames table
@return: the name string and the db name string associated with a
particular authornameid in the authornames table.
If the ID is not found returns empty values for the dict keys.
@rtype: dict
'''
names_dict = {"name": "",
"db_name": ""}
name = [row for row in dat.AUTHOR_NAMES
if row['id'] == authorname_id]
if len(name) > 0:
names_dict["name"] = name[0]['name']
names_dict["db_name"] = name[0]['db_name']
return names_dict
def get_name_bibrefs(authorname_id):
"""
Finds the bibrefID from authorname_id.
@param authorname_id: ID of the author name to look up the bibliographic
reference for
@type authorname_id: int
@return: the bibrefs associated with a particular authorid in the
authornames table. If the ID is not found, an empty string
shall be returned.
@rtype: string
"""
bibref_string = ""
bibrefs = [row['bibrefs'] for row in dat.AUTHOR_NAMES
if row['id'] == authorname_id]
if len(bibrefs) > 0:
bibref_string = bibrefs[0]
return bibref_string
def update_doclist(bibrec_id, authorname_id="", bibref=""):
"""
Update doclist table given bibrec_id and processed author. (inserts a new
document in the doclist table)
    @return: True if a new bibrecord has been added, False if this
bibrecord was previously processed
@rtype: boolean
"""
records = [row for row in dat.DOC_LIST
if row['bibrecid'] == bibrec_id]
if len(records) > 0:
# @note maybe it's better to have a comma-separated list in the
# 'authorname_id' column. That would keep the DB size
# lower. First steps for the implementation introduced; update
        # procedure necessary. Decision might be harder.
# Performance tests might help.
for record in records:
refrec = (authorname_id, bibref)
if ((authorname_id) and
(authorname_id not in record['authornameids']) and
(refrec not in record['authornameid_bibrefrec'])):
record['authornameids'] += [authorname_id]
record['authornameid_bibrefrec'] += [refrec]
elif ((authorname_id) and
(authorname_id in record['authornameids']) and
(refrec not in record['authornameid_bibrefrec'])):
record['authornameid_bibrefrec'] += [refrec]
else:
bconfig.LOGGER.warn("The author has already been processed on."
" the record. That's OK. Skipping entry.")
return False
else:
if authorname_id:
refrec = (authorname_id, bibref)
dat.DOC_LIST.append({'bibrecid': bibrec_id,
'authornameids': [authorname_id],
'authornameid_bibrefrec': [refrec]})
else:
dat.DOC_LIST.append({'bibrecid': bibrec_id,
'authornameids': [],
'authornameid_bibrefrec': []})
return True
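# Illustrative sketch (bibrec id 77, authorname id 3 and bibref "100:12" are
# made up): the first call appends a new entry to dat.DOC_LIST and returns
# True, a repeated call with the same pair is skipped and returns False.
#
#   >>> update_doclist(77, authorname_id=3, bibref="100:12")
#   True
#   >>> update_doclist(77, authorname_id=3, bibref="100:12")
#   False
#   # dat.DOC_LIST now holds an entry of the form
#   # {'bibrecid': 77, 'authornameids': [3],
#   #  'authornameid_bibrefrec': [(3, '100:12')]}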
def soft_compare_names(origin_name, target_name):
'''
    Soft comparison of names, to be used in search engines and similar.
    Base scores:
        If the surnames are equal: in [0.6, 1.0]
        If the surnames are similar: in [0.4, 0.8]
        If the surnames differ: in [0.0, 0.4]
    all depending on the average compatibility of first names and initials.
'''
jaro_fctn = None
try:
from Levenshtein import jaro_winkler
jaro_fctn = jaro_winkler
except ImportError:
jaro_fctn = jaro_winkler_str_similarity
score = 0.0
oname = deepcopy(origin_name)
tname = deepcopy(target_name)
orig_name = split_name_parts(oname.lower())
targ_name = split_name_parts(tname.lower())
orig_name[0] = clean_name_string(orig_name[0],
replacement="",
keep_whitespace=False)
targ_name[0] = clean_name_string(targ_name[0],
replacement="",
keep_whitespace=False)
if orig_name[0] == targ_name[0]:
score += 0.6
else:
if ((jaro_fctn(orig_name[0].lower(), targ_name[0].lower()) < .95)
or min(len(orig_name[0]), len(targ_name[0])) <= 4):
score += 0.0
else:
score += 0.4
if orig_name[1] and targ_name[1]:
max_initials = max(len(orig_name[1]), len(targ_name[1]))
matching_i = 0
if len(orig_name[1]) >= 1 and len(targ_name[1]) >= 1:
for i in orig_name[1]:
if i in targ_name[1]:
matching_i += 1
max_names = max(len(orig_name[2]), len(targ_name[2]))
matching_n = 0
if len(orig_name[2]) >= 1 and len(targ_name[2]) >= 1:
cleaned_targ_name = [clean_name_string(i, replacement="", keep_whitespace=False) for i in targ_name[2]]
for i in orig_name[2]:
if clean_name_string(i, replacement="", keep_whitespace=False) in cleaned_targ_name:
matching_n += 1
name_score = (matching_i + matching_n) * 0.4 / (max_names + max_initials)
score += name_score
return score
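# Illustrative score sketch (names are made up; exact values depend on
# split_name_parts and clean_name_string behaving as documented):
#
#   >>> soft_compare_names('Ellis, John', 'Ellis, John')
#   1.0       # equal surnames (0.6) plus fully matching names/initials (0.4)
#   >>> soft_compare_names('Ellis, John', 'Smith, John')
#   0.4       # surnames differ, only the first-name/initial part contributes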
def compare_names(origin_name, target_name):
"""
    Compute an index of confidence indicating whether two names might
    represent the same person. The computation is based on similarities of
    the name structure, in particular:
        Initials:
            We assign a high score if all initial matches are in the right
            order, and a much lower one if they are in the wrong order
        Names:
            We assign a lower score for mismatching names and a higher score
            for fully matching names
If there is nothing to compare we are forced to assume a high score.
Example for splitting names:
In : bibauthorid.split_name_parts("Ellis, John R")
Out: ['Ellis', ['J', 'R'], ['John']]
Ellis, R. Keith => [ [Ellis], [R, K], [Keith] ]
Ellis, Richard Keith => [ [Ellis], [R, K], [Richard, Keith] ]
    Since the initials are computed both from the real initials present in the
    name string and from the full first names, if there is no initials match
    we are 100% confident that:
        1. we have no names/initials at all, or
        2. we have completely different names;
    hence if there is no initial match we skip this step.
    @param origin_name: The name string of the first author
    @type origin_name: string
    @param target_name: The name string of the second author
    @type target_name: string
@return: a value that describes the likelihood of the names being the same
@rtype: float
"""
jaro_fctn = None
try:
from Levenshtein import jaro_winkler
jaro_fctn = jaro_winkler
except ImportError:
jaro_fctn = jaro_winkler_str_similarity
oname = deepcopy(origin_name)
tname = deepcopy(target_name)
orig_name = split_name_parts(oname.lower())
targ_name = split_name_parts(tname.lower())
bconfig.LOGGER.info("|--> Comparing Names: \"%s\" and \"%s\"" %
(origin_name, target_name))
lastname_modifier = 0.0
if not (orig_name[0] == targ_name[0]):
# last names are not equal before cleaning them. Assign entry penalty.
lastname_modifier = 0.15
orig_name[0] = clean_name_string(orig_name[0],
replacement="",
keep_whitespace=False)
targ_name[0] = clean_name_string(targ_name[0],
replacement="",
keep_whitespace=False)
if not (orig_name[0] == targ_name[0]):
if ((jaro_fctn(orig_name[0].lower(), targ_name[0].lower()) < .95)
or min(len(orig_name[0]), len(targ_name[0])) <= 4):
bconfig.LOGGER.warn(("Unequal lastnames(%s vs. %s)."
+ "Skipping Comparison")
% (orig_name[0], targ_name[0]))
return 0.0
else:
bconfig.LOGGER.log(25, "Last names are not equal; "
+ "but similar enough to continue the comparison")
# Let it go through...however, reduce the final result a little.
lastname_modifier = 0.24
else:
# last names are equal after cleaning them. Reduce penalty.
if lastname_modifier == 0.15:
lastname_modifier = 0.02
if orig_name[2] and targ_name[2]:
if len(orig_name[2]) > 1 or len(targ_name[2]) > 1:
variation_ps = []
oname_variations = create_name_tuples(orig_name[2])
tname_variations = create_name_tuples(targ_name[2])
for oname_variation in oname_variations:
for tname_variation in tname_variations:
oname_var = split_name_parts("%s, %s"
% (orig_name[0],
oname_variation))
tname_var = split_name_parts("%s, %s"
% (targ_name[0],
tname_variation))
variation_ps.append(_perform_matching(oname_var,
tname_var))
return max(variation_ps) - lastname_modifier
return _perform_matching(orig_name, targ_name) - lastname_modifier
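# Illustrative behaviour sketch (names are made up; precise scores depend on
# split_name_parts and on which Jaro-Winkler backend is installed):
#
#   >>> compare_names("Ellis, John R", "Smith, John R")
#   0.0       # surnames too dissimilar, the comparison is skipped entirely
#   >>> compare_names("Ellis, John R", "Ellis, J R") > 0.5
#   True      # same surname and compatible initials score high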
def _perform_matching(orig_name, targ_name):
    '''
    Combines the initials-based and first-name-based scores of two split
    names into a single weighted similarity value.
    @param orig_name: The first author's last name, first name(s) and initials
    @type orig_name: list of strings and lists of strings
    @param targ_name: The second author's last name, first name(s) and initials
    @type targ_name: list of strings and lists of strings
    @return: the weighted similarity of the two names
    @rtype: float
    '''
tname = deepcopy(targ_name)
oname = deepcopy(orig_name)
potential_name_matches = min(len(oname[2]), len(tname[2]))
names_p_weight = 0.0
initials_p_weight = _compare_initials(oname, tname)
if initials_p_weight > 0.0:
names_p_weight = _compare_first_names(oname, tname)
names_w = .5
ini_w = .5
if (names_p_weight > 0.6) and (potential_name_matches > 0):
names_w = .7
ini_w = .3
if (initials_p_weight == 1.0) and (len(oname[1]) != len(tname[1])):
initials_p_weight -= .1
if (names_p_weight == 1.0) and ((len(oname[2]) != len(tname[2]))
or not len(oname[2])) and (potential_name_matches < 2):
names_p_weight -= .1
if (initials_p_weight == 1.0) and (names_p_weight <= 0):
names_w = 0.
ini_w = 0.
res = names_w * names_p_weight + ini_w * initials_p_weight
# print "|--> Comparing Names: %s and %s" % (oname, tname)
bconfig.LOGGER.debug(("|---> iWeight (%s) * ip (%s) + nWeight " +
"(%s) * nP (%s) = %s") % (ini_w, initials_p_weight,
names_w, names_p_weight, res))
    return res
def _compare_initials(orig_name, targ_name):
'''
Compares Author's initials and returns the assigned score.
@param orig_name: The first author's last name, first name(s) and initial
@type orig_name: list of strings and lists of strings
@param targ_name: The second author's last name, first name(s) and initial
@type targ_name: list of strings and lists of strings
@return: a value describing the likelihood of the initials being the same
@rtype: float
'''
# determine minimal number of initials and declare the
# count of max. possible matches
tname = deepcopy(targ_name)
oname = deepcopy(orig_name)
max_possible_matches = min(len(oname[1]), len(tname[1]))
initial_weight_denominator = (float(1 + max_possible_matches) /
2.0) * max_possible_matches
initials_p_weight = 0.0
if max_possible_matches > 0:
for index, item in enumerate(oname[1]):
# print "|---> Trying Initial: ", I
if index < max_possible_matches:
try:
targ_index = tname[1].index(item)
if index == targ_index:
initials_p_weight += (
float(index + 1) / initial_weight_denominator)
else:
initials_p_weight += 1. / (5 * max_possible_matches *
abs(index - targ_index))
tname[1][targ_index] = ''
except (IndexError, ValueError, ZeroDivisionError):
# initials_p_weight = 0.1
break
else:
initials_p_weight = 0.0
return initials_p_weight
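# Worked example of the weighting above (made-up split names): with two
# initials on both sides, max_possible_matches is 2 and the denominator is
# (1 + 2) / 2.0 * 2 = 3.0, so initials matching in order contribute
# 1/3 + 2/3 = 1.0, while a swapped pair only adds 1 / (5 * 2 * 1) = 0.1 per
# out-of-place initial (0.2 in total).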
def _compare_first_names(orig_name, targ_name):
'''
Compares Author's first names and returns the assigned score.
@param orig_name: The first author's last name, first name(s) and initial
@type orig_name: list of strings and lists of strings
@param targ_name: The second author's last name, first name(s) and initial
@type targ_name: list of strings and lists of strings
@return: a value that describes the likelihood of the names being the same
@rtype: float
'''
# determine minimal number of names and declare the
# count of max. possible matches
string_similarity = None
try:
from Levenshtein import jaro_winkler
string_similarity = jaro_winkler
except ImportError:
string_similarity = jaro_winkler_str_similarity
tname = deepcopy(targ_name)
oname = deepcopy(orig_name)
names_p_weight = 0.0
max_possible_matches = float(min(len(oname[2]), len(tname[2])))
name_weight_denominator = ((1.0 + max_possible_matches)
/ 2.0 * max_possible_matches)
equal_set = set(oname[2]).intersection(set(tname[2]))
equal_names = [i for i in oname[2] if i in equal_set]
if max_possible_matches < 1.:
return 1.0
if len(equal_names) == max_possible_matches:
for index, item in enumerate(equal_names):
if index <= max_possible_matches:
try:
targ_index = tname[2].index(item)
initial_index = oname[1].index(item[0].upper())
if (index == targ_index) or (initial_index == targ_index):
names_p_weight += (float(index + 1) /
float(name_weight_denominator))
else:
names_p_weight += 1. / (2 * max_possible_matches *
abs(index - targ_index))
tname[2][targ_index] = ''
except (IndexError, ValueError, ZeroDivisionError):
break
else:
fuzzy_matches = 0
wrong_position_modifier = 0
# for name1 in oname[2]:
# for name2 in tname[2]:
# similarity = string_similarity(name1, name2)
# if similarity > 0.91:
# fuzzy_matches += 1
# if oname[2].index(name1) != tname[2].index(name2):
# wrong_position_modifier += 0.05
for name1 in oname[2]:
for name2 in tname[2]:
fuzzy_matches += string_similarity(name1, name2)
if oname[2].index(name1) != tname[2].index(name2):
wrong_position_modifier += 0.05
if fuzzy_matches > 0:
num_comparisons = len(oname[2]) * len(tname[2])
names_p_weight = (fuzzy_matches / num_comparisons -
wrong_position_modifier)
else:
names_p_weight = -0.3
return names_p_weight
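# Worked example of the weighting above (made-up split names): if both sides
# carry two first names that match exactly and in order, the denominator is
# (1 + 2) / 2.0 * 2 = 3.0 and the score is 1/3 + 2/3 = 1.0. If either side
# has no first names at all, 1.0 is returned outright. Otherwise the
# Jaro-Winkler similarities of all cross pairs are averaged, 0.05 is
# subtracted per out-of-position pair, and -0.3 is returned only when no
# fuzzy similarity is found at all.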
def create_name_tuples(names):
'''
Find name combinations, i.e. permutations of the names in different
positions of the name
@param names: a list of names
@type names: list of string
@return: the combinations of the names given
@rtype: list of lists of strings
'''
length = float(len(names))
max_tuples = int((length / 2) * (length - 1))
current_tuple = 1
pos = 0
off = 1
variants = [" ".join(names)]
for i in range(max_tuples):
variant = "%s %s %s" % (' '.join(names[0:pos]),
''.join(names[pos:off + 1]).capitalize(),
' '.join(names[off + 1::]))
variants.append(variant.strip())
pos += 1
off += 1
if off >= length:
            pos = 0
off = current_tuple + 1
current_tuple += 1
return variants
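# Illustrative output sketch (pure function, so the result below follows
# directly from the loop above):
#
#   >>> create_name_tuples(['guang', 'sheng'])
#   ['guang sheng', 'Guangsheng']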
def jaro_str_distance(str1, str2):
"""
The Jaro string similarity algorithm as described in
'Jaro, M.A. (1989): "Advances in record linkage methodology as applied
to the 1985 census of Tampa Florida'
@param str1: The first string
@type str1: string
@param str2: The second string
@type str2: string
@return: approximate string comparison measure (between 0.0 and 1.0)
@rtype: float
"""
if (not str1) or (not str2):
return 0.0
elif str1 == str2:
return 1.0
jaro_marker = chr(1)
len1 = len(str1)
len2 = len(str2)
halflen = max(len1, len2) / 2 + 1
assignments1 = ''
assignments2 = ''
workstr1 = str1
workstr2 = str2
common1 = common2 = 0
# Analyze the first string
for i in xrange(len1):
start = max(0, i - halflen)
end = min(i + halflen + 1, len2)
index = workstr2.find(str1[i], start, end)
if index > -1: # Found common character
common1 += 1
assignments1 = assignments1 + str1[i]
workstr2 = workstr2[:index] + jaro_marker + workstr2[index + 1:]
# Analyze the second string
for i in xrange(len2):
start = max(0, i - halflen)
end = min(i + halflen + 1, len1)
index = workstr1.find(str2[i], start, end)
if (index > -1): # Found common character
common2 += 1
assignments2 = assignments2 + str2[i]
workstr1 = workstr1[:index] + jaro_marker + workstr1[index + 1:]
common = float(common1 + common2) / 2.0
if (common == 0):
return 0.0
transpositions = 0
for i in xrange(len(assignments1)):
if (assignments1[i] != assignments2[i]):
transpositions += 1
transpositions /= 2.0
common = float(common)
len1 = float(len1)
len2 = float(len2)
jaro_constant = 1.0 / 3.0
jaro_transpositions = (common1 - transpositions) / common1
jaro_common_to_len_ratio = common1 / len1 + common1 / len2
dist = jaro_constant * (jaro_common_to_len_ratio + jaro_transpositions)
return dist
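# In formula form, with c common characters and t (half-)transpositions, the
# value computed above is dist = 1/3 * (c/len1 + c/len2 + (c - t)/c).
# Illustrative check against the textbook example (output rounded to avoid
# float-representation noise):
#
#   >>> round(jaro_str_distance("martha", "marhta"), 3)
#   0.944
#   >>> jaro_str_distance("martha", "martha")
#   1.0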
def _winkler_modifier(str1, str2, in_weight):
"""
Applies the winkler modifier to a score obtained by the Jaro string
similarity measure. This is described in Winkler, W.E. (1999) "The state
of record linkage and current research problems".
If the first characters of the two strings (up to first 4) are identical,
the similarity weight will be increased.
@param str1: The first string
@type str1: string
@param str2: The second string
@type str2: string
@param in_weight: Similarity score obtained by the Jaro algorithm
@type in_weight: float
@return: approximate string comparison measure (between 0.0 and 1.0)
@rtype: float
"""
if (not str1) or (not str2):
return 0.0
elif str1 == str2:
return 1.0
# Compute how many characters are common at beginning
minlen = min(len(str1), len(str2))
common_chars_num = 0
for common_chars_num in xrange(1, minlen + 1):
if str1[:common_chars_num] != str2[:common_chars_num]:
break
common_chars_num -= 1
if (common_chars_num > 4):
common_chars_num = 4
winkler_weight = in_weight + common_chars_num * 0.1 * (1.0 - in_weight)
final_result = 0.0
if winkler_weight >= 0.0 and winkler_weight <= 1.0:
final_result = winkler_weight
elif winkler_weight > 1.0:
final_result = 1.0
return final_result
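# In formula form: with l common leading characters (capped at 4) the weight
# becomes w' = w + l * 0.1 * (1.0 - w), i.e. the Jaro score is boosted
# towards 1.0 in proportion to the shared prefix.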
def jaro_winkler_str_similarity(str1, str2):
"""
For backwards compatibility, call Jaro followed by Winkler modification.
@param str1: The first string
@type str1: string
@param str2: The second string
@type str2: string
@return: approximate string comparison measure (between 0.0 and 1.0)
@rtype: float
"""
jaro_weight = jaro_str_distance(str1, str2)
return _winkler_modifier(str1, str2, jaro_weight)
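# Illustrative check of the pure-Python fallback against the classic
# Jaro-Winkler example (output rounded to avoid float-representation noise):
#
#   >>> round(jaro_winkler_str_similarity("martha", "marhta"), 3)
#   0.961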
def names_are_equal_composites(name1, name2):
'''
Checks if names are equal composites; e.g. "guangsheng" vs. "guang sheng"
@param name1: Name string of the first name (w/ last name)
@type name1: string
@param name2: Name string of the second name (w/ last name)
@type name2: string
@return: Are the names equal composites?
@rtype: boolean
'''
if not isinstance(name1, list):
name1 = split_name_parts(name1)
if not isinstance(name2, list):
name2 = split_name_parts(name2)
is_equal_composite = False
oname_variations = create_name_tuples(name1[2])
tname_variations = create_name_tuples(name2[2])
for oname_variation in oname_variations:
for tname_variation in tname_variations:
oname = clean_name_string(oname_variation.lower(), "", False, True)
tname = clean_name_string(tname_variation.lower(), "", False, True)
if oname == tname:
is_equal_composite = True
break
return is_equal_composite
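# Illustrative sketch (the surname 'Chen' is made up; clean_name_string is
# assumed to strip whitespace when keep_whitespace is False):
#
#   >>> names_are_equal_composites('Chen, Guangsheng', 'Chen, Guang Sheng')
#   True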
def names_are_equal_gender(name1, name2, gendernames):
'''
    Checks the gender equality of two names based on a word list
@param name1: Name string of the first name (w/ last name)
@type name1: string
@param name2: Name string of the second name (w/ last name)
@type name2: string
@param gendernames: dictionary of male/female names
@type gendernames: dict
@return: Are names gender-equal?
@rtype: boolean
'''
if not isinstance(name1, list):
name1 = split_name_parts(name1)
if not isinstance(name2, list):
name2 = split_name_parts(name2)
print_debug = False
names_are_equal_gender_b = True
ogender = None
tgender = None
oname = name1[2][0].lower()
tname = name2[2][0].lower()
oname = clean_name_string(oname, "", False, True)
tname = clean_name_string(tname, "", False, True)
if oname in gendernames['boys']:
ogender = 'Male'
elif oname in gendernames['girls']:
ogender = 'Female'
if tname in gendernames['boys']:
tgender = 'Male'
elif tname in gendernames['girls']:
tgender = 'Female'
if print_debug:
print ' Gender check: ', oname, ' is a ', ogender
print ' Gender check: ', tname, ' is a ', tgender
if ogender and tgender:
if ogender != tgender:
if print_debug:
print ' Gender differs, force split!'
names_are_equal_gender_b = False
return names_are_equal_gender_b
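# Illustrative sketch (the gendernames dict below is a made-up stand-in for
# the real word lists keyed by 'boys' and 'girls'):
#
#   >>> gnames = {'boys': ['john', 'robert'], 'girls': ['mary', 'roberta']}
#   >>> names_are_equal_gender('Smith, John', 'Smith, Mary', gnames)
#   False     # conflicting genders force a split
#   >>> names_are_equal_gender('Smith, John', 'Smith, Jo', gnames)
#   True      # an unknown first name never forces a split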
def names_are_synonymous(name1, name2, name_variations):
'''
Checks if two names are synonymous; e.g. "Robert" vs. "Bob"
@param name1: Name string of the first name (w/ last name)
@type name1: string
@param name2: Name string of the second name (w/ last name)
@type name2: string
@param name_variations: name variations list
@type name_variations: list of lists
@return: are names synonymous
@rtype: boolean
'''
if not isinstance(name1, list):
name1 = split_name_parts(name1)
if not isinstance(name2, list):
name2 = split_name_parts(name2)
print_debug = False
names_are_synonymous_b = False
max_matches = min(len(name1[2]), len(name2[2]))
matches = []
for i in xrange(max_matches):
matches.append(False)
for nvar in name_variations:
for i in xrange(max_matches):
oname = name1[2][i].lower()
tname = name2[2][i].lower()
oname = clean_name_string(oname, "", False, True)
tname = clean_name_string(tname, "", False, True)
if oname in nvar and tname in nvar:
if print_debug:
print ' ', oname, ' and ', tname, ' are synonyms! Not splitting!'
matches[i] = True
if sum(matches) == max_matches:
names_are_synonymous_b = True
break
return names_are_synonymous_b
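# Illustrative sketch (the variations list is a made-up stand-in for the real
# nickname knowledge base):
#
#   >>> nvars = [['robert', 'bob'], ['william', 'bill']]
#   >>> names_are_synonymous('Smith, Robert', 'Smith, Bob', nvars)
#   True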
def names_are_substrings(name1, name2):
'''
Checks if two names are substrings of each other; e.g. "Christoph" vs. "Ch"
Only checks for the beginning of the names.
@param name1: Name string of the first name (w/ last name)
@type name1: string
@param name2: Name string of the second name (w/ last name)
@type name2: string
    @return: are the names substrings of each other
@rtype: boolean
'''
if not isinstance(name1, list):
name1 = split_name_parts(name1)
if not isinstance(name2, list):
name2 = split_name_parts(name2)
onames = name1[2]
tnames = name2[2]
# oname = "".join(onames).lower()
# tname = "".join(tnames).lower()
oname = clean_name_string("".join(onames).lower(), "", False, True)
tname = clean_name_string("".join(tnames).lower(), "", False, True)
names_are_substrings_b = False
if (oname.startswith(tname)
or tname.startswith(oname)):
names_are_substrings_b = True
return names_are_substrings_b
def names_minimum_levenshtein_distance(name1, name2):
'''
    Determines the minimum Levenshtein distance D between two names.
    Comparison is based on all first names joined together
    (whitespace removed).
    Examples:
        D("guang", "guang sheng") = 5
        D("guang", "guangsheng") = 5
        D("guang sheng", "guangsheng") = 0
        D("guang sheng", "guang shing") = 1
        D("guang ming", "guang fin") = 2
@precondition: Names have been checked for composition equality.
@param name1: Name string of the first name (w/ last name)
@type name1: string
@param name2: Name string of the second name (w/ last name)
@type name2: string
@return: the minimum Levenshtein distance between two names
@rtype: int
'''
try:
from Levenshtein import distance
except ImportError:
bconfig.LOGGER.exception("Levenshtein Module not available!")
        return -1
if not isinstance(name1, list):
name1 = split_name_parts(name1)
if not isinstance(name2, list):
name2 = split_name_parts(name2)
onames = name1[2]
tnames = name2[2]
# min_names_count = min(len(onames), len(tnames))
#
# if min_names_count <= 0:
# return -1
#
# oname = "".join(onames[:min_names_count]).lower()
# tname = "".join(tnames[:min_names_count]).lower()
oname = clean_name_string("".join(onames).lower(), "", False, True)
tname = clean_name_string("".join(tnames).lower(), "", False, True)
return distance(oname, tname)
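# Illustrative sketch (requires the python-Levenshtein module; the surname
# 'Chen' is made up):
#
#   >>> names_minimum_levenshtein_distance('Chen, Guang', 'Chen, Guangsheng')
#   5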
