bibauthorid_tables_utils.py

# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
bibauthorid_tables_utils
Bibauthorid's DB handler
"""
import sys
import re
import random
import bibauthorid_config as bconfig
import bibauthorid_structs as dat
from search_engine import get_record
from bibrank_citation_searcher import get_citation_dict
from dbquery import run_sql
from dbquery import OperationalError, ProgrammingError
from bibauthorid_utils import split_name_parts, create_normalized_name
from bibauthorid_utils import clean_name_string
from bibauthorid_authorname_utils import update_doclist
def get_papers_recently_modified(date=''):
'''
Returns the bibrecs with a modification date more recent than date, or all
the bibrecs if no date is specified.
@param date: date
'''
papers = run_sql("select id from bibrec where modification_date > %s",
(str(date),))
if papers:
bibrecs = [i[0] for i in papers]
bibrecs.append(-1)
min_date = run_sql("select max(modification_date) from bibrec where "
"id in %s", (tuple(bibrecs),))
else:
min_date = run_sql("select now()")
return papers, min_date
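# Usage sketch (illustrative comment, not part of the original module): a
# periodic task could persist the returned date and feed it back in on the
# next run so that only new or modified records are processed; the date
# literal below is a made-up example.
#
#   papers, last_seen = get_papers_recently_modified(date='2011-01-01 00:00:00')
#   update_authornames_tables_from_paper(papers)  # papers looks like ((1,), (2,), ...)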
def populate_authornames_bibrefs_from_authornames():
'''
Populates aidAUTHORNAMESBIBREFS.
For each entry in aidAUTHORNAMES creates a corresponding entry in aidAUTHORNAMESBIBREFS so it is
possible to search by bibrec/bibref at a reasonable speed as well, and not only by name.
'''
nids = run_sql("select id,bibrefs from aidAUTHORNAMES")
for nid in nids:
for bibref in nid[1].split(','):
if bconfig.TABLES_UTILS_DEBUG:
print ('populate_authornames_bibrefs_from_authornames: Adding: '
' %s %s' % (str(nid[0]), str(bibref)))
run_sql("insert into aidAUTHORNAMESBIBREFS (Name_id, bibref) "
"values (%s,%s)", (str(nid[0]), str(bibref)))
def authornames_tables_gc(bunch_size=50):
'''
Performs garbage collection on the authornames tables.
Potentially very slow.
'''
bunch_start = run_sql("select min(id) from aidAUTHORNAMESBIBREFS")
if bunch_start and bunch_start[0][0] is not None:
bunch_start = int(bunch_start[0][0])
else:
return
abfs_ids_bunch = run_sql("select id,Name_id,bibref from aidAUTHORNAMESBIBREFS limit %s, %s"
, (str(bunch_start - 1), str(bunch_size)))
bunch_start += bunch_size
while len(abfs_ids_bunch) >= 1:
bib100list = []
bib700list = []
for i in abfs_ids_bunch:
if i[2].split(':')[0] == '100':
bib100list.append(i[2].split(':')[1])
elif i[2].split(':')[0] == '700':
bib700list.append(i[2].split(':')[1])
bib100liststr = '( '
for i in bib100list:
bib100liststr += "'" + str(i) + "',"
bib100liststr = bib100liststr[0:len(bib100liststr) - 1] + " )"
bib700liststr = '( '
for i in bib700list:
bib700liststr += "'" + str(i) + "',"
bib700liststr = bib700liststr[0:len(bib700liststr) - 1] + " )"
if len(bib100list) >= 1:
bib10xids = run_sql("select id from bib10x where id in %s"
% bib100liststr)
else:
bib10xids = []
if len(bib700list) >= 1:
bib70xids = run_sql("select id from bib70x where id in %s"
% bib700liststr)
else:
bib70xids = []
bib10xlist = []
bib70xlist = []
for i in bib10xids:
bib10xlist.append(str(i[0]))
for i in bib70xids:
bib70xlist.append(str(i[0]))
bib100junk = set(bib100list).difference(set(bib10xlist))
bib700junk = set(bib700list).difference(set(bib70xlist))
idsdict = {}
for i in abfs_ids_bunch:
idsdict[i[2]] = [i[0], i[1]]
junklist = []
for i in bib100junk:
junklist.append('100:' + i)
for i in bib700junk:
junklist.append('700:' + i)
for junkref in junklist:
try:
id_to_remove = idsdict[junkref]
run_sql("delete from aidAUTHORNAMESBIBREFS where id=%s",
(str(id_to_remove[0]),))
if bconfig.TABLES_UTILS_DEBUG:
print "authornames_tables_gc: idAUTHORNAMESBIBREFS deleting row " + str(id_to_remove)
authrow = run_sql("select id,Name,bibrefs,db_name from aidAUTHORNAMES where id=%s", (str(id_to_remove[1]),))
if len(authrow[0][2].split(',')) == 1:
run_sql("delete from aidAUTHORNAMES where id=%s", (str(id_to_remove[1]),))
if bconfig.TABLES_UTILS_DEBUG:
print "authornames_tables_gc: aidAUTHORNAMES deleting " + str(authrow)
else:
bibreflist = ''
for ref in authrow[0][2].split(','):
if ref != junkref:
bibreflist += ref + ','
bibreflist = bibreflist[0:len(bibreflist) - 1]
run_sql("update aidAUTHORNAMES set bibrefs=%s where id=%s",
(bibreflist, id_to_remove[1]))
if bconfig.TABLES_UTILS_DEBUG:
print "authornames_tables_gc: aidAUTHORNAMES updating " + str(authrow) + ' with ' + str(bibreflist)
except:
pass
abfs_ids_bunch = run_sql("select id,Name_id,bibref from aidAUTHORNAMESBIBREFS limit %s,%s" ,
(str(bunch_start - 1), str(bunch_size)))
bunch_start += bunch_size
def update_authornames_tables_from_paper(papers_list=[]):
"""
Updates the authornames tables with the names on the given papers list
@param papers_list: list of the papers which have been updated (bibrecs) ((1,),)
For each paper in the list, gathers all names, bibrefs and bibrecs to be added to the aidAUTHORNAMES
table, taking care of updating aidAUTHORNAMESBIBREFS as well.
NOTE: update_authornames_tables_from_paper: as a reminder, get_record would be faster, but
the bibref is not available there. Maybe there is a way to rethink everything so that bibrefs
are not needed; how would authors be addressed then?
"""
def update_authornames_tables(name, bibref):
'''
Update the tables for one (name, bibref) tuple
'''
authornames_row = run_sql("select id,Name,bibrefs,db_name from aidAUTHORNAMES where db_name like %s",
(str(name),))
authornames_bibrefs_row = run_sql("select id,Name_id,bibref from aidAUTHORNAMESBIBREFS "
"where bibref like %s", (str(bibref),))
#@XXX: update_authornames_tables: if I'm not wrong, there should always be only one result; this is checked further on
if ((len(authornames_row) > 1) or (len(authornames_bibrefs_row) > 1) or
(len(authornames_row) < len(authornames_bibrefs_row))):
if bconfig.TABLES_UTILS_DEBUG:
print "update_authornames_tables: More then one result or missing authornames?? Something is wrong, not updating" + str(authornames_row) + str(authornames_bibrefs_row)
return
if len(authornames_row) == 1:
# we have a hit for the name string; check if the 'new' bibref is already associated:
# if yes there is nothing to do, otherwise we should add it here and in the aidAUTHORNAMESBIBREFS table
if authornames_row[0][2].count(bibref) < 1:
if bconfig.TABLES_UTILS_DEBUG:
print 'update_authornames_tables: Adding new bibref to ' + str(authornames_row) + ' ' + str(name) + ' ' + str(bibref)
run_sql("update aidAUTHORNAMES set bibrefs=%s where id=%s",
(authornames_row[0][2] + ',' + str(bibref), authornames_row[0][0]))
if len(authornames_bibrefs_row) < 1:
# we have to add the bibref to the name; it would be strange if it were already there
run_sql("insert into aidAUTHORNAMESBIBREFS (Name_id,bibref) values (%s,%s)",
(authornames_row[0][0], str(bibref)))
else:
if bconfig.TABLES_UTILS_DEBUG:
print 'update_authornames_tables: Nothing to add to ' + str(authornames_row) + ' ' + str(name) + ' ' + str(bibref)
else:
#@NOTE: update_authornames_tables: we don't have the name string in the db: either the name associated with the bibref has changed
# or this is a new name. Experimenting with bibupload, it looks like a new bibref is created if a name on a paper changes;
#
if len(authornames_bibrefs_row) == 1:
# If len(authornames_row) is zero but we have a row in authornames_bibrefs_row, it means that
# the name string has changed, somehow!
# @FIXME: update_authornames_tables: should this case really be considered?
if bconfig.TABLES_UTILS_DEBUG:
print 'update_authornames_tables: The name associated with the bibref has changed?? ' + str(name) + ' ' + str(bibref)
else:
artifact_removal = re.compile("[^a-zA-Z0-9]")
raw_name = artifact_removal.sub("", name)
if len(raw_name) > 1:
dbname = name
else:
dbname = 'Error in name parsing!'
clean_name = create_normalized_name(split_name_parts(name))
authornamesid = run_sql("insert into aidAUTHORNAMES (Name,bibrefs,db_name) values (%s,%s,%s)",
(clean_name, str(bibref), dbname))
run_sql("insert into aidAUTHORNAMESBIBREFS (Name_id,bibref) values (%s,%s)",
(authornamesid, str(bibref)))
if bconfig.TABLES_UTILS_DEBUG:
print 'update_authornames_tables: Created new name ' + str(authornamesid) + ' ' + str(name) + ' ' + str(bibref)
tables = [['bibrec_bib10x', 'bib10x', '100__a', '100'], ['bibrec_bib70x', 'bib70x', '700__a', '700']]
for paper in papers_list:
for table in tables:
sqlstr = "select id_bibxxx from %s where id_bibrec=" % table[0]
bibrefs = run_sql(sqlstr+"%s", (str(paper[0]),))
for ref in bibrefs:
sqlstr = "select value from %s where tag='%s' and id=" % (table[1], table[2])
name = run_sql(sqlstr+"%s", (str(ref[0]),))
if len(name) >= 1:
update_authornames_tables(name[0][0], table[3] + ':' + str(ref[0]))
def populate_authornames():
"""
Author names table population from bib10x and bib70x
Average Runtime: 376.61 sec (6.27 min) for 327k entries
Should be called only on an empty table; afterwards use
update_authornames_tables_from_paper with the new papers which
are coming in or are modified.
"""
max_rows_per_run = bconfig.TABLE_POPULATION_BUNCH_SIZE
if max_rows_per_run == -1:
max_rows_per_run = 5000
max100 = run_sql("SELECT COUNT(id) FROM bib10x WHERE tag = '100__a'")
max700 = run_sql("SELECT COUNT(id) FROM bib70x WHERE tag = '700__a'")
tables = "bib10x", "bib70x"
authornames_is_empty_checked = 0
authornames_is_empty = 1
# Bring author names from bib10x and bib70x to authornames table
for table in tables:
if table == "bib10x":
table_number = "100"
else:
table_number = "700"
querylimiter_start = 0
querylimiter_max = eval('max' + str(table_number) + '[0][0]')
if bconfig.TABLES_UTILS_DEBUG:
print "\nProcessing %s (%s entries):" % (table, querylimiter_max)
sys.stdout.write("0% ")
sys.stdout.flush()
while querylimiter_start <= querylimiter_max:
sys.stdout.write(".")
sys.stdout.flush()
percentage = int(((querylimiter_start + max_rows_per_run) * 100)
/ querylimiter_max)
sys.stdout.write(".%s%%." % (percentage))
sys.stdout.flush()
# Query the Database for a list of authors from the correspondent
# tables--several thousands at a time
bib = run_sql("SELECT id, value FROM %s WHERE tag = '%s__a' "
"LIMIT %s, %s" % (table, table_number,
querylimiter_start, max_rows_per_run))
authorexists = None
querylimiter_start += max_rows_per_run
for i in bib:
# For mental sanity, exclude things that are not names...
# Yes, I know that there are strange names out there!
# Yes, I read the 40 misconceptions about names.
# Yes, I know!
# However, these statistical outliers are harmful.
artifact_removal = re.compile("[^a-zA-Z0-9]")
authorname = ""
if not i[1]:
continue
raw_name = artifact_removal.sub("", i[1])
if len(raw_name) > 1:
authorname = i[1]
if not authorname:
continue
if not authornames_is_empty_checked:
authornames_is_empty = run_sql("SELECT COUNT(id) "
"FROM aidAUTHORNAMES")
authornames_is_empty = (authornames_is_empty[0][0] == 0)
authornames_is_empty_checked = 1
if not authornames_is_empty:
# Find duplicates in the database and append id if
# duplicate is found
authorexists = run_sql("SELECT id,Name,bibrefs,db_name FROM aidAUTHORNAMES "
"WHERE db_name = %s", (authorname,))
bibrefs = "%s:%s" % (table_number, i[0])
if not authorexists:
insert_name = ""
if len(authorname) > 240:
bconfig.LOGGER.warn("\nName too long, truncated to 240"
" chars: %s" % (authorname))
insert_name = authorname[0:240]
else:
insert_name = authorname
run_sql("INSERT INTO aidAUTHORNAMES VALUES"
" (NULL, %s, %s, %s)",
(create_normalized_name(
split_name_parts(insert_name)),
bibrefs, insert_name))
if authornames_is_empty:
authornames_is_empty = 0
else:
if authorexists[0][2].count(bibrefs) < 1:
upd_bibrefs = "%s,%s" % (authorexists[0][2], bibrefs)
run_sql("UPDATE aidAUTHORNAMES SET bibrefs = "
"%s WHERE id = %s",
(upd_bibrefs, authorexists[0][0]))
if bconfig.TABLES_UTILS_DEBUG:
sys.stdout.write(" Done.")
sys.stdout.flush()
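# Usage sketch (illustrative comment, not part of the original module): per the
# docstring above, this is meant for an empty aidAUTHORNAMES table; a later
# cleanup of dangling bibrefs could follow with the garbage collector.
#
#   populate_authornames()
#   populate_authornames_bibrefs_from_authornames()
#   authornames_tables_gc(bunch_size=100)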
def get_diff_marc10x70x_to_anames():
'''
Determines the difference between the union of bib10x and bib70x and the
aidAUTHORNAMES table.
It will return the entries which are present in bib10x and bib70x but not
in aidAUTHORNAMES. Meant to be run periodically.
@todo: get_diff_marc10x70x_to_anames: find meaningful use for the
returned results.
@return: a list of the author names not contained in the authornames table
@rtype: list
'''
run_sql("DROP VIEW authors")
run_sql("create view authors AS \
(SELECT value FROM bib10x WHERE tag =\"100__a\") \
UNION \
(SELECT value FROM bib70x WHERE tag =\"700__a\")")
diff = run_sql("SELECT value from authors LEFT JOIN aidAUTHORNAMES as b"
+ " ON (authors.value = b.Name) WHERE b.name IS NULL")
return diff
def populate_doclist_for_author_surname(surname):
"""
Searches for all the documents containing a given surname and processes
them: creates the virtual author for each author on a document.
@param surname: The search is based on this last name.
@type surname: string
"""
if not dat.CITES_DICT:
cites = get_citation_dict("citationdict")
for key in cites:
dat.CITES_DICT[key] = cites[key]
if not dat.CITED_BY_DICT:
cited_by = get_citation_dict("reversedict")
for key in cited_by:
dat.CITED_BY_DICT[key] = cited_by[key]
bconfig.LOGGER.log(25, "Populating document list for %s" % (surname))
init_authornames(surname)
authors = [row for row in dat.AUTHOR_NAMES if not row['processed']]
for author in authors:
marc_100 = []
marc_700 = []
temp_marc = author['bibrefs'].split(',')
for j in temp_marc:
marcfield, internalid = j.split(':')
if marcfield == '100':
marc_100.append(internalid)
elif marcfield == '700':
marc_700.append(internalid)
else:
bconfig.LOGGER.error("Wrong MARC field. How did you do"
" that?!--This should never happen! boo!")
bibrecs = []
if marc_100:
bibrecs_100 = run_sql("SELECT id_bibrec FROM bibrec_bib10x"
+ " WHERE id_bibxxx = %s"
% (" OR id_bibxxx = ".join(marc_100)))
for j in bibrecs_100:
bibrecs.append(j[0])
if marc_700:
bibrecs_700 = run_sql("SELECT id_bibrec FROM bibrec_bib70x"
+ " WHERE id_bibxxx = %s"
% (" OR id_bibxxx = ".join(marc_700)))
for j in bibrecs_700:
bibrecs.append(j[0])
if load_records_to_mem_cache(bibrecs):
for bibrec in bibrecs:
update_doclist(bibrec, authorname_id=author['id'])
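# Usage sketch (illustrative comment, not part of the original module): the
# disambiguation job for a single last-name cluster would typically start
# here; the surname value below is a made-up example.
#
#   populate_doclist_for_author_surname("Ellis")
#   # dat.RELEVANT_RECORDS and dat.DOC_LIST are now filled for this surname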
def load_records_to_mem_cache(bibrec_ids):
'''
Loads all the records specified in the list into the memory storage
facility. It will try to attach citation information to each record in
the process.
@param bibrec_ids: list of bibrec IDs to load to memory
@type bibrec_ids: list
@return: Success (True) or failure (False) of the process
@rtype: boolean
'''
if not bibrec_ids:
return False
for bibrec in bibrec_ids:
if not bibrec in dat.RELEVANT_RECORDS:
rec = get_record(bibrec)
if bconfig.LIMIT_AUTHORS_PER_DOCUMENT:
is_collaboration = False
authors = 0
try:
for field in rec['710'][0][0]:
if field[0] == 'g':
is_collaboration = True
break
except KeyError:
pass
if is_collaboration:
# If experimentalists shall be excluded uncomment
# the following line
#continue
pass
else:
try:
for field in rec['100'][0][0]:
if field[0] == 'a':
authors += 1
break
except KeyError:
pass
try:
for coauthor in rec['700']:
if coauthor[0][0][0] == 'a':
authors += 1
except KeyError:
pass
if authors > bconfig.MAX_AUTHORS_PER_DOCUMENT:
continue
dat.RELEVANT_RECORDS[bibrec] = rec
cites = []
cited_by = []
try:
cites = dat.CITES_DICT[bibrec]
except KeyError:
pass
try:
cited_by = dat.CITED_BY_DICT[bibrec]
except KeyError:
pass
dat.RELEVANT_RECORDS[bibrec]['cites'] = cites
dat.RELEVANT_RECORDS[bibrec]['cited_by'] = cited_by
return True
def init_authornames(surname):
'''
Initializes the AUTHOR_NAMES memory storage
@param surname: The surname to search for
@type surname: string
'''
if len(dat.AUTHOR_NAMES) > 0:
existing = [row for row in dat.AUTHOR_NAMES
if row['name'].split(",")[0] == surname]
if existing:
bconfig.LOGGER.log(25, "AUTHOR_NAMES already holds the "
"correct data.")
else:
bconfig.LOGGER.debug("AUTHOR_NAMES will have additional content")
for updated in [row for row in dat.AUTHOR_NAMES
if not row['processed']]:
updated['processed'] = True
_perform_authornames_init(surname)
else:
_perform_authornames_init(surname)
def _perform_authornames_init(surname):
'''
Performs the actual AUTHOR_NAMES memory storage init by reading values
from the database
@param surname: The surname to search for
@type surname: string
'''
# instead of replacing with ' ', this will construct the regex for the
# SQL query as well as the next if statement.
surname = clean_name_string(surname,
replacement=".{0,3}",
keep_whitespace=False)
if not surname.startswith(".{0,3}"):
surname = "^['`-]*%s" % (surname)
sql_query = ("SELECT id, name, bibrefs, db_name "
"FROM aidAUTHORNAMES WHERE name REGEXP \"%s\""
% (surname))
for author in run_sql(sql_query):
dat.AUTHOR_NAMES.append({'id': author[0],
'name': author[1],
'bibrefs': author[2],
'db_name': author[3],
'processed': False})
def find_all_last_names():
'''
Filters out all last names from all names in the database.
@return: a list of last names
@rtype: list of strings
'''
all_names = run_sql("SELECT Name FROM aidAUTHORNAMES")
last_names = set()
for name in all_names:
last_name = split_name_parts(name[0])[0]
# For mental sanity, exclude things that are not names...
# - Single letter names
# - Single number names
# - Names containing only numbers and/or symbols
# Yes, I know that there are strange names out there!
# Yes, I read the 40 misconceptions about names.
# Yes, I know!
# However, these statistical outliers are harmful to the data set.
artifact_removal = re.compile("[^a-zA-Z0-9]")
last_name_test = artifact_removal.sub("", last_name)
if len(last_name_test) > 1:
last_names.add("%s," % (last_name,))
# for name in all_names:
# last_names.add([split_name_parts(name[0]), name[0]])
return list(last_names)
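# Usage sketch (illustrative comment, not part of the original module): a full
# run could iterate over all last names while skipping clusters that already
# exist in the database; note that find_all_last_names() returns names with a
# trailing comma, while get_existing_last_names() does not.
#
#   done = set(get_existing_last_names())
#   for lname in find_all_last_names():
#       surname = lname.rstrip(',')
#       if surname not in done:
#           populate_doclist_for_author_surname(surname)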
def write_mem_cache_to_tables(sanity_checks=False):
'''
Reads every memory cache and writes its contents to the appropriate
table in the database.
@param sanity_checks: Perform sanity checks before inserting (i.e. is the
data already present in the db?) and after the insertion (i.e. is the
data entered correctly?)
@type sanity_checks: boolean
'''
ra_id_offset = run_sql("SELECT max(realauthorID) FROM"
+ " aidREALAUTHORS")[0][0]
va_id_offset = run_sql("SELECT max(virtualauthorID) FROM"
+ " aidVIRTUALAUTHORS")[0][0]
cluster_id_offset = run_sql("SELECT max(id) FROM"
" aidVIRTUALAUTHORSCLUSTERS")[0][0]
if not ra_id_offset:
ra_id_offset = 0
if not va_id_offset:
va_id_offset = 0
if not cluster_id_offset:
cluster_id_offset = 0
max_va_id = dat.ID_TRACKER["va_id_counter"]
if max_va_id <= 1:
max_va_id = 2
random_va_id = random.randint(1, max_va_id - 1)
va_mem_data = [row['value'] for row in dat.VIRTUALAUTHOR_DATA
if (row["virtualauthorid"] == random_va_id
and row['tag'] == "orig_authorname_id")][0]
if sanity_checks:
if va_mem_data:
check_on_va = run_sql("SELECT id,virtualauthorID,tag,value FROM aidVIRTUALAUTHORSDATA "
"WHERE tag='orig_authorname_id' AND "
"value=%s" , (va_mem_data,))
if check_on_va:
bconfig.LOGGER.error("Sanity check reported that the data "
"exists. We'll skip this record for now. "
"Please check the data set manually.")
return False
bconfig.LOGGER.log(25, "Writing to persistence layer")
bconfig.LOGGER.log(25, "Offsets...RA: %s; VA: %s; CL: %s" % (ra_id_offset,
va_id_offset,
cluster_id_offset))
batch_max = bconfig.TABLE_POPULATION_BUNCH_SIZE
query = []
for va_cluster in dat.VIRTUALAUTHOR_CLUSTERS:
if len(query) >= batch_max:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into virtual author "
"cluster table failed")
return False
query = []
query.append("INSERT INTO aidVIRTUALAUTHORSCLUSTERS (cluster_name) "
"VALUES (\"%s\"); "
% (va_cluster['clustername'],))
if query:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into virtual author "
"cluster table failed")
return False
query = []
for va_data in dat.VIRTUALAUTHOR_DATA:
if len(query) >= batch_max:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into virtual author "
"data table failed")
return False
query = []
query.append("INSERT INTO aidVIRTUALAUTHORSDATA "
"(virtualauthorID, tag, value) VALUES "
"(%d, \"%s\", \"%s\"); "
% (va_data['virtualauthorid'] + va_id_offset,
va_data['tag'], va_data['value']))
if query:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into virtual author "
"data table failed")
return False
query = []
for va_entry in dat.VIRTUALAUTHORS:
if len(query) >= batch_max:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into virtual author "
"table failed")
return False
query = []
query.append("INSERT INTO aidVIRTUALAUTHORS "
"(virtualauthorID, authornamesID, p, clusterID) VALUES "
"(%d, %d, \"%s\", %d); "
% (va_entry['virtualauthorid'] + va_id_offset,
va_entry['authornamesid'], va_entry['p'],
va_entry['clusterid'] + cluster_id_offset))
if query:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into virtual author "
"table failed")
return False
query = []
for ra_data in dat.REALAUTHOR_DATA:
if len(query) >= batch_max:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into real author "
"data table failed")
return False
query = []
if not ra_data['tag'] == 'outgoing_citation':
query.append("INSERT INTO aidREALAUTHORDATA "
"(realauthorID, tag, value, va_count, "
"va_names_p, va_p) VALUES "
"(%d, \"%s\", \"%s\", %d, "
"%f, %f); "
% (ra_data['realauthorid'] + ra_id_offset,
ra_data['tag'], ra_data['value'],
ra_data['va_count'], ra_data['va_np'],
ra_data['va_p']))
if query:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into real author "
"data table failed")
return False
query = []
for ra_entry in dat.REALAUTHORS:
if len(query) >= batch_max:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into real author "
"table failed")
return False
query = []
query.append("INSERT INTO aidREALAUTHORS "
"(realauthorID, virtualauthorID, p) VALUES "
"(%d, %d, %f); "
% (ra_entry['realauthorid'] + ra_id_offset,
ra_entry['virtualauthorid'] + va_id_offset,
ra_entry['p']))
if query:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into real author "
"table failed")
return False
query = []
for doc in dat.DOC_LIST:
if len(query) >= batch_max:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into doc list "
"table failed")
return False
query = []
for processed_author in doc['authornameids']:
query.append("INSERT INTO aidDOCLIST "
"(bibrecID, processed_author) VALUES "
"(%d, %d); "
% (doc['bibrecid'], processed_author))
if query:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into doc list "
"table failed")
return False
query = []
if sanity_checks:
if va_mem_data:
check_on_va = run_sql("SELECT id,virtualauthorID,tag,value FROM aidVIRTUALAUTHORSDATA "
"WHERE tag='orig_authorname_id' AND "
"value=%s" , (va_mem_data,))
if not check_on_va:
bconfig.LOGGER.error("Sanity check reported that no data "
" exists in the database after writing "
" to it.")
return False
bconfig.LOGGER.log(25, "Everything is now written to the database. "
"Thanks. Bye.")
return True
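# Usage sketch (illustrative comment, not part of the original module): after
# the clustering code elsewhere in bibauthorid has filled the dat.* memory
# caches for a surname, the results can be persisted in one go.  Enabling
# sanity_checks trades speed for a duplicate-insertion guard.
#
#   if not write_mem_cache_to_tables(sanity_checks=True):
#       bconfig.LOGGER.error("Persisting the memory cache failed")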
def get_existing_last_names():
'''
Find all authors that have been processed and written to the database.
Extract all last names from this list and return these last names.
Especially helpful to exclude these clusters (last names) from a run.
@return: list of last names
@rtype: list of strings
'''
bconfig.LOGGER.log(25, "Reading info about existing authors from database")
db_lnames = set()
db_names = run_sql("select value from aidVIRTUALAUTHORSDATA where"
+ " tag='orig_name_string'")
for i in db_names:
db_lnames.add(i[0].split(',')[0])
return list(db_lnames)
def get_len_authornames_bibrefs():
'''
Reads the lengths of authornames and bibrefs.
Used to determine if essential tables already exist.
@return: dict({'names': -1, 'bibrefs': -1})
@rtype: dict
'''
lengths = {'names':-1,
'bibrefs':-1}
if check_and_create_aid_tables():
authornames_len = run_sql("SELECT count(id) from aidAUTHORNAMES")
bibrefs_len = run_sql("SELECT count(id) from aidAUTHORNAMESBIBREFS")
try:
lengths['names'] = int(authornames_len[0][0])
lengths['bibrefs'] = int(bibrefs_len[0][0])
except (ValueError, TypeError):
lengths['names'] = -1
lengths['bibrefs'] = -1
return lengths
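# Usage sketch (illustrative comment, not part of the original module): a value
# of -1 signals that the tables are missing or unreadable, 0 that they exist
# but are empty, so a caller could branch like this.
#
#   table_lengths = get_len_authornames_bibrefs()
#   if table_lengths['names'] < 0:
#       bconfig.LOGGER.error("Essential aid* tables are missing")
#   elif table_lengths['names'] == 0:
#       populate_authornames()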
def check_and_create_aid_tables():
'''
Checks if the database tables for Bibauthorid exist.
@return: True if the tables exist, False if they do not or if there was an error
@rtype: boolean
'''
try:
if not run_sql("show tables like 'aidAUTHORNAMES';"):
return False
except (ProgrammingError, OperationalError):
return False
return True
def load_mem_cache_from_tables():
'''
Loads database content for an author's last name cluster into the
memory storage facility.
@precondition: memory storage facility needs to be loaded with respective
authornames data (init_authornames(lastname))
@return: Success (True) or failure (False) of the loading process
@rtype: boolean
'''
# print "check for authornames mem table"
if not dat.AUTHOR_NAMES:
return False
authornames_ids = [row['id'] for row in dat.AUTHOR_NAMES]
if not authornames_ids:
return False
# print "Building offsets"
ra_id_offset = run_sql("SELECT max(realauthorID) FROM"
" aidREALAUTHORS")[0][0]
va_id_offset = run_sql("SELECT max(virtualauthorID) FROM"
" aidVIRTUALAUTHORS")[0][0]
cluster_id_offset = run_sql("SELECT max(id) FROM"
" aidVIRTUALAUTHORSCLUSTERS")[0][0]
dat.set_tracker("raid_counter", ra_id_offset + 1)
dat.set_tracker("va_id_counter", va_id_offset + 1)
dat.set_tracker("cluster_id", cluster_id_offset + 1)
# print "working on authornames ids..."
for authornames_id in authornames_ids:
db_vas = run_sql("SELECT virtualauthorid, authornamesid, p, clusterid "
"from aidVIRTUALAUTHORS WHERE authornamesid = %s",
(authornames_id,))
# print "loading VAs for authid %s" % authornames_id
db_vas_set = set([row[0] for row in db_vas])
if not db_vas_set:
db_vas_set = (-1, -1)
else:
db_vas_set.add(-1)
db_vas_tuple = tuple(db_vas_set)
db_ras = run_sql("SELECT realauthorid FROM "
"aidREALAUTHORS WHERE virtualauthorid in %s"
, (tuple(db_vas_tuple),))
if db_ras:
db_ras_set = set([row[0] for row in db_ras])
db_ras_set.add(-1)
db_ras_tuple = tuple(db_ras_set)
db_ra_vas = run_sql("SELECT virtualauthorid FROM aidREALAUTHORS "
"WHERE realauthorid in %s", (db_ras_tuple,))
db_ra_vas_set = set([row[0] for row in db_ra_vas])
db_ra_vas_set.add(-1)
db_ras_tuple = tuple(db_ra_vas_set)
db_vas_all = run_sql("SELECT virtualauthorid, authornamesid, p, "
"clusterid FROM aidVIRTUALAUTHORS WHERE "
"virtualauthorid in %s",
(db_ras_tuple,))
else:
db_vas_all = db_vas
for db_va in db_vas_all:
dat.VIRTUALAUTHORS.append({'virtualauthorid': db_va[0],
'authornamesid': db_va[1],
'p': db_va[2],
'clusterid': db_va[3]})
if not dat.VIRTUALAUTHORS:
# print "No Virtual Authors loaded. None created before."
return True
# print "Loading clusters"
cluster_ids = set([row['clusterid'] for row in dat.VIRTUALAUTHORS])
if not cluster_ids:
cluster_ids = (-1, -1)
else:
cluster_ids.add(-1)
db_va_clusters = run_sql("SELECT id, cluster_name FROM "
"aidVIRTUALAUTHORSCLUSTERS WHERE id in %s"
, (tuple(cluster_ids),))
# print "Storing clusters"
for db_va_cluster in db_va_clusters:
dat.VIRTUALAUTHOR_CLUSTERS.append({'clusterid': db_va_cluster[0],
'clustername': db_va_cluster[1]})
# print "Loading VA data"
va_ids = set([row['virtualauthorid'] for row in dat.VIRTUALAUTHORS])
if not va_ids:
va_ids = (-1, -1)
else:
va_ids.add(-1)
# print "Storing VA data"
db_va_data = run_sql("SELECT virtualauthorid, tag, value FROM "
"aidVIRTUALAUTHORSDATA WHERE virtualauthorid in %s"
, (tuple(va_ids),))
for db_va_dat in db_va_data:
dat.VIRTUALAUTHOR_DATA.append({'virtualauthorid' : db_va_dat[0],
'tag': db_va_dat[1],
'value': db_va_dat[2]})
# print "Loading RAs"
db_ras = run_sql("SELECT realauthorid, virtualauthorid, p FROM "
"aidREALAUTHORS WHERE virtualauthorid in %s"
, (tuple(va_ids),))
# print "Storing RAs"
for db_ra in db_ras:
dat.REALAUTHORS.append({'realauthorid': db_ra[0],
'virtualauthorid': db_ra[1],
'p': db_ra[2]})
# print "Loading RA data"
ra_ids = set([row['realauthorid'] for row in dat.REALAUTHORS])
if not ra_ids:
ra_ids = (-1, -1)
else:
ra_ids.add(-1)
db_ra_data = run_sql("SELECT realauthorid, tag, value, va_count, "
"va_names_p, va_p FROM aidREALAUTHORDATA WHERE "
"realauthorid in %s", (tuple(ra_ids),))
# print "Storing RA data"
for db_ra_dat in db_ra_data:
dat.REALAUTHOR_DATA.append({'realauthorid': db_ra_dat[0],
'tag': db_ra_dat[1],
'value': db_ra_dat[2],
'va_count': db_ra_dat[3],
'va_np': db_ra_dat[4],
'va_p': db_ra_dat[5]})
# print "Loading doclist entries"
bibrec_ids = set([int(row['value']) for row in dat.REALAUTHOR_DATA
if row['tag'] == "bibrec_id"])
if not bibrec_ids:
bibrec_ids = (-1, -1)
else:
bibrec_ids.add(-1)
db_doclist = run_sql("SELECT bibrecid, processed_author FROM aidDOCLIST "
"WHERE bibrecid in %s", (tuple(bibrec_ids),))
# print "Storing doclist entries"
for db_doc in db_doclist:
existing_item = [row for row in dat.DOC_LIST
if row['bibrecid'] == db_doc[0]]
if existing_item:
for update in [row for row in dat.DOC_LIST
if row['bibrecid'] == db_doc[0]]:
if not db_doc[1] in update['authornameids']:
update['authornameids'].append(db_doc[1])
else:
dat.DOC_LIST.append({'bibrecid': db_doc[0],
'authornameids': [db_doc[1]]})
real_bibrec_ids = set(bibrec_ids)
real_bibrec_ids.discard(-1)
if real_bibrec_ids:
# print "will load recs"
if not load_records_to_mem_cache(list(real_bibrec_ids)):
# print "FAILED loading records"
return False
return True
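# Usage sketch (illustrative comment, not part of the original module): for an
# update run on an already-processed surname, the memory cache is primed from
# the database first; init_authornames() must have been called beforehand, as
# noted in the precondition above.
#
#   init_authornames("Ellis")
#   if load_mem_cache_from_tables():
#       pass  # run the (external) clustering update on the cached data here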
def update_tables_from_mem_cache(sanity_checks=False, return_ra_updates=False):
'''
Updates the tables in the database with the information in the memory
storage while taking into account only changed data to optimize the time
needed for the update.
@param sanity_checks: Perform sanity checks while updating--slows down the
process but might detect mistakes and prevent harm. Default: False
@type sanity_checks: boolean
@param return_ra_updates: Will force the method to return a list of real
author ids that have been updated. Default: False
@type return_ra_updates: boolean
@return: A tuple: True if the update went through without trouble, False if it
did not, plus a list of updated real author ids (or an empty list)
@rtype: tuple of (boolean, list)
'''
del_ra_ids = set([-1])
del_va_ids = dat.UPDATES_LOG['deleted_vas'].union(
dat.UPDATES_LOG['touched_vas'])
if del_va_ids:
del_va_ids.add(-1)
del_ra_ids_db = run_sql("SELECT realauthorid FROM aidREALAUTHORS "
"WHERE virtualauthorid in %s"
, (tuple(del_va_ids),))
for ra_id in del_ra_ids_db:
del_ra_ids.add(ra_id[0])
if sanity_checks:
va_count_db = run_sql("SELECT COUNT(DISTINCT virtualauthorid) "
"FROM aidVIRTUALAUTHORS WHERE "
"virtualauthorid in %s"
, (tuple(del_va_ids),))
try:
va_count_db = int(va_count_db[0][0])
except (ValueError, IndexError, TypeError):
bconfig.LOGGER.exception("Error while reading number of "
"virtual authors in database")
va_count_db = -1
if not (va_count_db == len(del_va_ids)):
bconfig.LOGGER.error("Sanity checks reported that the number "
"of virtual authors in the memory "
"storage is not equal to the number of "
"virtual authors in the database. "
"Aborting update mission.")
return (False, [])
bconfig.LOGGER.log(25, "Removing updated entries from "
"persistence layer")
run_sql("DELETE FROM aidVIRTUALAUTHORSDATA "
"WHERE virtualauthorid in %s", (tuple(del_va_ids),))
run_sql("DELETE FROM aidVIRTUALAUTHORS "
"WHERE virtualauthorid in %s", (tuple(del_va_ids),))
if len(tuple(del_ra_ids)) > 1:
run_sql("DELETE FROM aidREALAUTHORDATA "
"WHERE realauthorid in %s", (tuple(del_ra_ids),))
run_sql("DELETE FROM aidREALAUTHORS "
"WHERE realauthorid in %s", (tuple(del_ra_ids),))
insert_ra_ids = dat.UPDATES_LOG['new_ras'].union(del_ra_ids)
insert_va_ids = dat.UPDATES_LOG['new_vas'].union(
dat.UPDATES_LOG['touched_vas'])
bconfig.LOGGER.log(25, "Writing to persistence layer")
batch_max = bconfig.TABLE_POPULATION_BUNCH_SIZE
ra_id_db_max = run_sql("SELECT max(realauthorID) FROM"
" aidREALAUTHORS")[0][0]
va_id_db_max = run_sql("SELECT max(virtualauthorID) FROM"
" aidVIRTUALAUTHORS")[0][0]
cluster_id_db_max = run_sql("SELECT max(id) FROM"
" aidVIRTUALAUTHORSCLUSTERS")[0][0]
if not ra_id_db_max or not va_id_db_max or not cluster_id_db_max:
return (False, [])
new_clusters = [row for row in dat.VIRTUALAUTHOR_CLUSTERS
if row['clusterid'] > cluster_id_db_max]
query = []
if not insert_ra_ids or not insert_va_ids:
bconfig.LOGGER.log(25, "Saving update to persistence layer finished "
"with success! (There was nothing to do.)")
return (True, [])
for va_cluster in new_clusters:
if len(query) >= batch_max:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into virtual author "
"cluster table failed")
return (False, [])
query = []
query.append("INSERT INTO aidVIRTUALAUTHORSCLUSTERS (cluster_name) "
"VALUES (\"%s\"); "
% (va_cluster['clustername'],))
if query:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into virtual author "
"cluster table failed")
return (False, [])
query = []
va_data_to_insert = [row for row in dat.VIRTUALAUTHOR_DATA
if row['virtualauthorid'] in insert_va_ids]
if sanity_checks:
db_existing_va_ids = run_sql("SELECT COUNT(DISTINCT virtualauthorid) "
"WHERE virtualauthorid in %s"
, (tuple(insert_va_ids),))
try:
db_existing_va_ids = int(va_count_db[0][0])
except (ValueError, IndexError, TypeError):
bconfig.LOGGER.exception("Error while reading number of "
"virtual authors in database")
db_existing_va_ids = -1
if not (db_existing_va_ids == 0):
bconfig.LOGGER.error("Sanity checks reported that the "
"virtual authors in the memory storage "
"that shall be inserted already exist "
"in the database. Aborting update mission.")
return (False, [])
for va_data in va_data_to_insert:
if len(query) >= batch_max:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into virtual author "
"data table failed")
return (False, [])
query = []
query.append("INSERT INTO aidVIRTUALAUTHORSDATA "
"(virtualauthorID, tag, value) VALUES "
"(%d, \"%s\", \"%s\"); "
% (va_data['virtualauthorid'],
va_data['tag'], va_data['value']))
if query:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into virtual author "
"data table failed")
return (False, [])
query = []
vas_to_insert = [row for row in dat.VIRTUALAUTHORS
if row['virtualauthorid'] in insert_va_ids]
for va_entry in vas_to_insert:
if len(query) >= batch_max:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into virtual author "
"table failed")
return (False, [])
query = []
query.append("INSERT INTO aidVIRTUALAUTHORS "
"(virtualauthorID, authornamesID, p, clusterID) VALUES "
"(%d, %d, \"%s\", %d); "
% (va_entry['virtualauthorid'],
va_entry['authornamesid'], va_entry['p'],
va_entry['clusterid']))
if query:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into virtual author "
"table failed")
return (False, [])
query = []
if sanity_checks:
db_existing_ra_ids = run_sql("SELECT COUNT(DISTINCT realauthorid) "
"FROM aidREALAUTHORS "
"WHERE realauthorid in %s"
, (tuple(insert_ra_ids),))
try:
db_existing_ra_ids = int(db_existing_ra_ids[0][0])
except (ValueError, IndexError, TypeError):
bconfig.LOGGER.exception("Error while reading number of "
"real authors in database")
db_existing_ra_ids = -1
if not (db_existing_ra_ids == 0):
bconfig.LOGGER.error("Sanity checks reported that the "
"real authors in the memory storage "
"that shall be inserted already exist "
"in the database. Aborting update mission.")
return (False, [])
ra_data_to_insert = [row for row in dat.REALAUTHOR_DATA
if row['realauthorid'] in insert_ra_ids]
for ra_data in ra_data_to_insert:
if len(query) >= batch_max:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into real author "
"data table failed")
return (False, [])
query = []
if not ra_data['tag'] == 'outgoing_citation':
query.append("INSERT INTO aidREALAUTHORDATA "
"(realauthorID, tag, value, va_count, "
"va_names_p, va_p) VALUES "
"(%d, \"%s\", \"%s\", %d, "
"%f, %f); "
% (ra_data['realauthorid'],
ra_data['tag'], ra_data['value'],
ra_data['va_count'], ra_data['va_np'],
ra_data['va_p']))
if query:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into real author "
"data table failed")
return (False, [])
query = []
ras_to_insert = [row for row in dat.REALAUTHORS
if row['realauthorid'] in insert_ra_ids]
for ra_entry in ras_to_insert:
if len(query) >= batch_max:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into real author "
"table failed")
return (False, [])
query = []
query.append("INSERT INTO aidREALAUTHORS "
"(realauthorID, virtualauthorID, p) VALUES "
"(%d, %d, %f); "
% (ra_entry['realauthorid'],
ra_entry['virtualauthorid'],
ra_entry['p']))
if query:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into real author "
"table failed")
return (False, [])
query = []
if sanity_checks:
db_existing_ra_ids = run_sql("SELECT COUNT(DISTINCT realauthorid) "
"FROM aidREALAUTHORS "
"WHERE realauthorid in %s"
, (tuple(insert_ra_ids),))
try:
db_existing_ra_ids = int(db_existing_ra_ids[0][0])
except (ValueError, IndexError, TypeError):
bconfig.LOGGER.exception("Error while reading number of "
"real authors in database")
db_existing_ra_ids = -1
if not (db_existing_ra_ids == len(insert_ra_ids)):
bconfig.LOGGER.error("Sanity checks reported that the number of"
"real authors in the memory storage "
"that shall be inserted is not equal to "
"the number of real authors now "
"in the database. Aborting update mission.")
return (False, [])
recid_updates = dat.UPDATES_LOG["rec_updates"]
if recid_updates:
recid_updates.add(-1)
run_sql("DELETE FROM aidDOCLIST WHERE bibrecid in %s"
, (tuple(recid_updates),))
doclist_insert = [row for row in dat.DOC_LIST
if row['bibrecid'] in dat.UPDATES_LOG["rec_updates"]]
for doc in doclist_insert:
if len(query) >= batch_max:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into doc list "
"table failed")
return (False, [])
query = []
for processed_author in doc['authornameids']:
query.append("INSERT INTO aidDOCLIST "
"(bibrecID, processed_author) VALUES "
"(%d, %d); "
% (doc['bibrecid'], processed_author))
if query:
try:
run_sql(''.join(query))
except:
bconfig.LOGGER.critical("Inserting into doc list "
"table failed")
return (False, [])
query = []
bconfig.LOGGER.log(25, "Saving update to persistence layer finished "
"with success!")
if return_ra_updates:
ra_ids = [[row['realauthorid']] for row in ras_to_insert]
return (True, ra_ids)
else:
return (True, [])
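# Usage sketch (illustrative comment, not part of the original module): the
# returned tuple separates overall success from the list of touched real
# authors, which a caller might feed into further post-processing.
#
#   success, updated_ras = update_tables_from_mem_cache(return_ra_updates=True)
#   if not success:
#       bconfig.LOGGER.error("Updating the aid* tables from memory failed")
#   else:
#       bconfig.LOGGER.log(25, "Updated %s real authors" % len(updated_ras))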
