engine_utils.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Fri, Nov 1, 21:38

engine_utils.py
View Options

	# -*- coding: utf-8 -*-
	#
	# This file is part of Invenio.
	# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015 CERN.
	#
	# Invenio is free software; you can redistribute it and/or
	# modify it under the terms of the GNU General Public License as
	# published by the Free Software Foundation; either version 2 of the
	# License, or (at your option) any later version.
	#
	# Invenio is distributed in the hope that it will be useful, but
	# WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with Invenio; if not, write to the Free Software Foundation, Inc.,
	# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	"""bibindex.engine_utils: useful regular expressions for tokenizers
	and several helper functions.
	"""


	import re
	import sys

	from invenio.base.helpers import utf8ifier

	from invenio.legacy.dbquery import run_sql, \
	DatabaseError
	from invenio.legacy.bibsched.bibtask import write_message
	from invenio.legacy.bibrecord import get_fieldvalues
	from invenio.config import \
	CFG_BIBINDEX_CHARS_PUNCTUATION, \
	CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS
	from invenio.legacy.bibindex.engine_config import CFG_BIBINDEX_COLUMN_VALUE_SEPARATOR


	# Matches an inline ($...$) or a display (\[...\]) LaTeX formula.  The
	# quantifiers are non-greedy so that several formulas in one string are
	# matched separately instead of as one span from the first to the last
	# delimiter.
	latex_formula_re = re.compile(r'\$.*?\$|\\\[.*?\\\]')
	# Matches a single phrase delimiter: one of . : ; ? !
	phrase_delimiter_re = re.compile(r'[\.:;\?\!]')
	# Matches any run of one or more whitespace characters.
	space_cleaner_re = re.compile(r'\s+')
	# One or more repetitions of the configured punctuation pattern anchored
	# at the start of the string.
	re_block_punctuation_begin = re.compile(r"^" + CFG_BIBINDEX_CHARS_PUNCTUATION + "+")
	# One or more repetitions of the configured punctuation pattern anchored
	# at the end of the string.
	re_block_punctuation_end = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION + "+$")
	# A single occurrence of the configured punctuation pattern.
	re_punctuation = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION)
	# A single occurrence of the configured alphanumeric separator pattern.
	re_separators = re.compile(CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS)
	# Strings starting with "arxiv:" followed by four digits, a dot and four
	# digits.  No flags are given, so the prefix must be lowercase.
	re_arxiv = re.compile(r'^arxiv:\d\d\d\d\.\d\d\d\d')

	# A whitespace character, a comma or a period anywhere in the string.
	re_pattern_fuzzy_author_trigger = re.compile(r'[\s\,\.]')
	# FIXME: re_pattern_fuzzy_author_trigger could be removed and a
	# BibAuthorID API function could be called instead after we
	# double-check that there are no circular imports.


	def load_tokenizers():
	    """Return a dictionary of all registered bibindex tokenizers.

	    Deprecated: a ``DeprecationWarning`` is emitted on every call, pointing
	    to ``load_tokenizers()`` in ``invenio.modules.indexer.utils``.

	    Each key is the last component of a tokenizer module's dotted name.
	    Each value is the attribute of that same name looked up on the module,
	    or ``''`` when the module has no such attribute.
	    """
	    import warnings
	    warnings.warn("The function is deprecated. Please use the "
	                  "`load_tokenizers()` from `invenio.modules.indexer.utils`",
	                  DeprecationWarning)
	    from invenio.modules.indexer.registry import tokenizers

	    loaded = {}
	    for module in tokenizers:
	        short_name = module.__name__.split('.')[-1]
	        loaded[short_name] = getattr(module, short_name, '')
	    return loaded

	def get_all_index_names_and_column_values(column_name):
	    """Return (name, column value) pairs for every row of idxINDEX.

	    The column name is interpolated directly into the SQL text, so it
	    must come from trusted code, never from user input.

	    If the query raises DatabaseError (for instance because the column
	    does not exist), a message is written to stderr and the pairs
	    collected so far (normally none) are returned.

	    Example: output=[('global', something), ('title', something)].

	    @param column_name: name of the idxINDEX column to fetch next to name
	    @return: list of (index name, column value) tuples
	    """
	    query = """SELECT name, %s FROM idxINDEX""" % column_name
	    pairs = []
	    try:
	        pairs.extend((row[0], row[1]) for row in run_sql(query))
	    except DatabaseError:
	        write_message("Exception caught for SQL statement: %s; column %s might not exist" % (query, column_name), sys.stderr)
	    return pairs


	def get_all_synonym_knowledge_bases():
	    """Return synonym knowledge base settings keyed by index name.

	    Reads the ``synonym_kbrs`` column of every index and splits each
	    non-empty value on CFG_BIBINDEX_COLUMN_VALUE_SEPARATOR.  Indexes whose
	    value is empty (or NULL) are left out, so the result is an empty
	    dictionary when no index has knowledge base information.

	    Example: output['global'] = ('INDEX-SYNONYM-TITLE', 'exact'),
	             output['title'] = ('INDEX-SYNONYM-TITLE', 'exact').

	    @return: dictionary mapping index name to a tuple of the split values
	    """
	    out = {}
	    for row in get_all_index_names_and_column_values("synonym_kbrs"):
	        kb_data = row[1]
	        # Truthiness instead of len(): a NULL column value arrives as None
	        # and must be skipped like an empty string rather than raising.
	        if kb_data:
	            out[row[0]] = tuple(kb_data.split(CFG_BIBINDEX_COLUMN_VALUE_SEPARATOR))
	    return out


	def get_index_remove_stopwords(index_id):
	    """Return the remove_stopwords setting of an index.

	    The value of the remove_stopwords field of the idxINDEX table is
	    returned as is, unless it is 'No' or an empty string, in which case
	    False is returned.  False is also returned when the value cannot be
	    read (query failure or no row for this index).
	    Just for consistency with WordTable.

	    @param index_id: id of the index
	    @return: the stored value, or False
	    """
	    try:
	        result = run_sql("SELECT remove_stopwords FROM idxINDEX WHERE ID=%s", (index_id, ))[0][0]
	    except Exception:
	        # Best effort: any failure means "do not remove stopwords".
	        # Exception (not a bare except) so that KeyboardInterrupt and
	        # SystemExit still propagate.
	        return False
	    if result in ('No', ''):
	        return False
	    return result


	def get_index_remove_html_markup(index_id):
	    """Return the remove_html_markup setting of an index as a boolean.

	    Reads the remove_html_markup field of the idxINDEX table and returns
	    True only when it equals 'Yes'.  False is returned for any other
	    value and when the value cannot be read (query failure or no row for
	    this index).
	    Just for consistency with WordTable.

	    @param index_id: id of the index
	    @return: True or False
	    """
	    try:
	        result = run_sql("SELECT remove_html_markup FROM idxINDEX WHERE ID=%s", (index_id, ))[0][0]
	    except Exception:
	        # Best effort; Exception (not a bare except) so that
	        # KeyboardInterrupt and SystemExit still propagate.
	        return False
	    return result == 'Yes'


	def get_index_remove_latex_markup(index_id):
	    """Return the remove_latex_markup setting of an index as a boolean.

	    Reads the remove_latex_markup field of the idxINDEX table and returns
	    True only when it equals 'Yes'.  False is returned for any other
	    value and when the value cannot be read (query failure or no row for
	    this index).
	    Just for consistency with WordTable.

	    @param index_id: id of the index
	    @return: True or False
	    """
	    try:
	        result = run_sql("SELECT remove_latex_markup FROM idxINDEX WHERE ID=%s", (index_id, ))[0][0]
	    except Exception:
	        # Best effort; Exception (not a bare except) so that
	        # KeyboardInterrupt and SystemExit still propagate.
	        return False
	    return result == 'Yes'


	def author_name_requires_phrase_search(p):
	    """
	    Tell whether author query pattern p requires phrase search.

	    This is the case when p contains at least one character matched by
	    re_pattern_fuzzy_author_trigger (whitespace, comma or period).

	    @param p: author query pattern
	    @return: True or False
	    """
	    return re_pattern_fuzzy_author_trigger.search(p) is not None


	def get_field_count(recID, tags):
	    """
	    Count the field instances of record RECID that carry one of TAGS.

	    @param recID: record ID
	    @type recID: int
	    @param tags: list of tags to count, e.g. ['100__a', '700__a']
	    @type tags: list
	    @return: total number of values found over all the given tags
	    @rtype: int
	    @note: The values themselves are fetched via get_fieldvalues() and
	        then counted, which may not be very efficient.  Could use counts
	        only, or else retrieve stored recstruct format of the record
	        and walk through it.
	    """
	    return sum(len(get_fieldvalues(recID, tag)) for tag in tags)


	def run_sql_drop_silently(query):
	    """
	    Run an SQL DROP statement, ignoring the case of a missing table.

	    A DROP statement with an IF EXISTS part generates a warning if the
	    table does not exist.  To mute the warning, the " IF EXISTS" part is
	    removed from the query and the resulting "Unknown table" error is
	    swallowed instead.  Any other error is re-raised unchanged.

	    @param query: the DROP statement to run
	    """
	    try:
	        run_sql(query.replace(" IF EXISTS", ""))
	    except Exception as e:
	        if "Unknown table" not in str(e):
	            # Bare raise keeps the original traceback intact.
	            raise


	from invenio.modules.indexer.utils import get_idx_indexer


	def get_all_indexes(virtual=True, with_ids=False):
	    """Return all defined words indexes.

	    @param virtual: if True, virtual indexes are included; if False, every
	        index whose id appears as id_virtual in idxINDEX_idxINDEX is
	        left out
	    @param with_ids: if True, the rows (id, name) are returned instead of
	        bare names
	    @return: list of index names, or list of (id, name) rows; empty list
	        when no index is defined
	    Example: output=['global', 'author']."""
	    if virtual:
	        id_column = "id," if with_ids else ""
	        template = """SELECT %s name FROM idxINDEX"""
	    else:
	        id_column = "w.id," if with_ids else ""
	        template = """SELECT %s w.name FROM idxINDEX AS w
	WHERE w.id NOT IN (SELECT DISTINCT id_virtual FROM idxINDEX_idxINDEX)"""
	    rows = run_sql(template % id_column)
	    if with_ids:
	        return list(rows)
	    return [row[0] for row in rows]


	def get_all_virtual_indexes():
	    """Return the (id, name) rows of all defined 'virtual' indexes.

	    An index counts as virtual when its id appears as id_virtual in the
	    idxINDEX_idxINDEX table.  The result of run_sql() is returned as is.
	    """
	    return run_sql("""SELECT DISTINCT v.id_virtual, w.name FROM idxINDEX_idxINDEX AS v,
	idxINDEX AS w
	WHERE v.id_virtual=w.id""")


	def get_index_virtual_indexes(index_id):
	    """Return the 'virtual' indexes to be indexed together with an index.

	    @param index_id: id of the normal index
	    @return: (id, name) rows of the virtual indexes that have index_id
	        as id_normal in idxINDEX_idxINDEX, as returned by run_sql()
	    """
	    sql = """SELECT v.id_virtual, w.name FROM idxINDEX_idxINDEX AS v,
	idxINDEX AS w
	WHERE v.id_virtual=w.id AND
	v.id_normal=%s"""
	    return run_sql(sql, (index_id,))


	def is_index_virtual(index_id):
	    """Tell whether the index is virtual.

	    @param index_id: id of the index
	    @return: True when index_id appears as id_virtual in
	        idxINDEX_idxINDEX, False otherwise
	    """
	    sql = """SELECT id_virtual FROM idxINDEX_idxINDEX
	WHERE id_virtual=%s"""
	    return bool(run_sql(sql, (index_id,)))

	def filter_for_virtual_indexes(index_list):
	    """
	    Remove all non-virtual indexes from the given list of indexes.

	    @param index_list: list of index names
	    @return: list of the names from index_list that belong to a virtual
	        index, without duplicates and in no particular order; empty list
	        when there is no virtual index at all
	    """
	    # Collect the names directly instead of subscripting zip(), which only
	    # works where zip() returns a list and needs an IndexError handler for
	    # the empty case.
	    virtual_names = set(row[1] for row in get_all_virtual_indexes())
	    return list(virtual_names & set(index_list))

	def get_virtual_index_building_blocks(index_id):
	    """Return the indexes that make up the given virtual index.

	    @param index_id: id of the virtual index
	    @return: (id, name) rows of the normal indexes, as returned by
	        run_sql(); empty when index_id is the id of a normal
	        (non-virtual) index
	    """
	    sql = """SELECT v.id_normal, w.name FROM idxINDEX_idxINDEX AS v,
	idxINDEX AS w
	WHERE v.id_normal=w.id AND
	v.id_virtual=%s"""
	    return run_sql(sql, (index_id,))


	def get_index_id_from_index_name(index_name):
	    """Return the words/phrase index id for INDEXNAME.

	    Returns 0 in case there is no index with this name.
	    Example: field='author', output=4.

	    @param index_name: name of the index
	    """
	    rows = run_sql("""SELECT w.id FROM idxINDEX AS w
	WHERE w.name=%s LIMIT 1""", (index_name,), 1)
	    return rows[0][0] if rows else 0


	def get_index_name_from_index_id(index_id):
	    """Return the words/phrase index name for INDEXID.

	    Returns '' in case there is no index with this id.
	    Example: field=9, output='fulltext'.

	    @param index_id: id of the index
	    """
	    rows = run_sql("SELECT name FROM idxINDEX WHERE id=%s", (index_id,))
	    return rows[0][0] if rows else ''


	def get_field_tags(field, tagtype="marc"):
	    """Return the list of tags for the field code 'field'.

	    Works for both MARC and nonMARC tags.  Tags are ordered by
	    descending score of the field/tag association.  An empty list is
	    returned when the field has no tags; database errors are not
	    handled here and propagate to the caller.

	    Example: field='author', output=['100__%','700__%'].

	    @param field: field code
	    @param tagtype: "marc" selects the tag's ``value`` column; any other
	        value (normally "nonmarc") selects ``recjson_value``, whose
	        comma-separated entries are split into individual tags and whose
	        NULL entries are skipped.  Default is "marc" for backward
	        compatibility.
	    @return: list of tags
	    """
	    is_marc = tagtype == "marc"
	    column = "value" if is_marc else "recjson_value"
	    # Only the column name is interpolated here; the doubled %%s survives
	    # as the %s placeholder for the field code.
	    query = """SELECT t.%s FROM tag AS t,
	field_tag AS ft,
	field AS f
	WHERE f.code=%%s AND
	ft.id_field=f.id AND
	t.id=ft.id_tag
	ORDER BY ft.score DESC""" % column
	    res = run_sql(query, (field,))
	    if is_marc:
	        return [row[0] for row in res]
	    values = []
	    for row in res:
	        if row[0] is not None:
	            values.extend(row[0].split(","))
	    return values


	def get_marc_tag_indexes(tag, virtual=True):
	    """Return ids and names of the indexes corresponding to the given tag.

	    @param tag: MARC tag in one of the forms:
	        'xx%', 'xxx', 'xxx__a', 'xxx__%'
	    @param virtual: if True, the virtual indexes built on top of the
	        found indexes are returned as well
	    @return: (id, name) rows; an empty tuple when nothing matches.  With
	        virtual=True and at least one match the rows are returned as a
	        tuple of the normal rows followed by the virtual ones; otherwise
	        the result of run_sql() is returned as is.
	    """
	    tag2 = tag[0:2] + "%"  # for tags in the form: 10%
	    tag3 = tag[:-1] + "%"  # for tags in the form: 100__%
	    query = """SELECT DISTINCT w.id,w.name FROM idxINDEX AS w,
	idxINDEX_field AS wf,
	field_tag AS ft,
	tag as t
	WHERE (t.value=%%s OR
	t.value=%%s OR
	%s) AND
	t.id=ft.id_tag AND
	ft.id_field=wf.id_field AND
	wf.id_idxINDEX=w.id"""
	    # endswith() rather than tag[-1] so that an empty tag does not raise
	    # IndexError.
	    if tag.endswith("%"):
	        missing_piece = "t.value LIKE %s"
	    elif len(tag) == 3:
	        missing_piece = "t.value LIKE %s"
	        tag3 = tag + "%"  # for all tags which start from 'tag'
	    else:
	        missing_piece = "t.value=%s"
	    query = query % missing_piece
	    res = run_sql(query, (tag, tag2, tag3))
	    if not res:
	        return ()
	    if not virtual:
	        return res
	    response = list(res)
	    # The ids come from the database, not from the caller, so they are
	    # joined straight into the IN (...) list.  A comprehension is used
	    # instead of subscripting zip(), which only works where zip() returns
	    # a list.
	    index_ids = [str(row[0]) for row in res]
	    query = """SELECT DISTINCT v.id_virtual,w.name FROM idxINDEX_idxINDEX AS v,
	idxINDEX as w
	WHERE v.id_virtual=w.id AND
	v.id_normal IN ("""
	    query = query + ", ".join(index_ids) + ")"
	    response.extend(run_sql(query))
	    return tuple(response)


	def get_nonmarc_tag_indexes(nonmarc_tag, virtual=True):
	    """Return ids and names of the indexes for the given nonmarc tag.

	    A nonmarc tag can also be called a 'bibfield field'.  The tag is
	    looked up in the comma-separated ``recjson_value`` column: as the
	    whole value, or at the beginning, in the middle or at the end of the
	    list.

	    @param nonmarc_tag: the nonmarc tag to look for
	    @param virtual: if True, the virtual indexes built on top of the
	        found indexes are returned as well
	    @return: (id, name) rows; an empty tuple when nothing matches.  With
	        virtual=True and at least one match the rows are returned as a
	        tuple of the normal rows followed by the virtual ones; otherwise
	        the result of run_sql() is returned as is.
	    """
	    query = """SELECT DISTINCT w.id, w.name FROM idxINDEX AS w,
	idxINDEX_field AS wf,
	field_tag AS ft,
	tag as t
	WHERE (t.recjson_value LIKE %s OR
	t.recjson_value LIKE %s OR
	t.recjson_value LIKE %s OR
	t.recjson_value=%s) AND
	t.id=ft.id_tag AND
	ft.id_field=wf.id_field AND
	wf.id_idxINDEX=w.id"""

	    # NOTE(review): these are parameter values, so the doubled '%%' is
	    # presumably sent to the database as two consecutive LIKE wildcards
	    # (equivalent to one) -- confirm against run_sql before simplifying.
	    at_the_begining = nonmarc_tag + ',%%'
	    in_the_middle = '%%,' + nonmarc_tag + ',%%'
	    at_the_end = '%%,' + nonmarc_tag

	    res = run_sql(query, (at_the_begining, in_the_middle, at_the_end, nonmarc_tag))
	    if not res:
	        return ()
	    if not virtual:
	        return res
	    response = list(res)
	    # The ids come from the database, not from the caller, so they are
	    # joined straight into the IN (...) list.  A comprehension is used
	    # instead of subscripting zip(), which only works where zip() returns
	    # a list.
	    index_ids = [str(row[0]) for row in res]
	    query = """SELECT DISTINCT v.id_virtual,w.name FROM idxINDEX_idxINDEX AS v,
	idxINDEX as w
	WHERE v.id_virtual=w.id AND
	v.id_normal IN ("""
	    query = query + ", ".join(index_ids) + ")"
	    response.extend(run_sql(query))
	    return tuple(response)


	def get_index_tags(indexname, virtual=True, tagtype="marc"):
	    """Return the list of tags that are indexed inside INDEXNAME.

	    The tags of every field attached to the index are collected with
	    get_field_tags().  When the index has no tags of its own and
	    ``virtual`` is True, the index is treated as a virtual one and the
	    distinct tags of the indexes it is built from are returned instead
	    (in no particular order).  Empty tags are dropped from the result.

	    Returns empty list in case there are no tags indexed in this index.
	    Example: field='author', output=['100__%', '700__%'].

	    @param indexname: name of the index
	    @param virtual: if True, fall back to the building blocks of a
	        virtual index when the index has no tags of its own
	    @param tagtype: can be: "marc" or "nonmarc", default value
	        is "marc" for backward compatibility
	    @return: list of tags
	    """
	    out = []
	    query = """SELECT f.code FROM idxINDEX AS w,
	idxINDEX_field AS wf,
	field AS f
	WHERE w.name=%s AND
	w.id=wf.id_idxINDEX AND
	f.id=wf.id_field"""
	    for row in run_sql(query, (indexname,)):
	        out.extend(get_field_tags(row[0], tagtype))
	    if not out and virtual:
	        index_id = get_index_id_from_index_name(indexname)
	        # A comprehension instead of subscripting zip(), which only works
	        # where zip() returns a list.
	        dependent_indexes = [str(row[0]) for row in
	                             get_virtual_index_building_blocks(index_id)]
	        if not dependent_indexes:
	            # Not a virtual index; also avoids sending "IN ()" to the
	            # database.
	            return out
	        tags = set()
	        query = """SELECT DISTINCT f.code FROM idxINDEX AS w, idxINDEX_field AS wf, field AS f
	WHERE w.id=wf.id_idxINDEX AND
	f.id=wf.id_field AND
	w.id IN ("""
	        query = query + ", ".join(dependent_indexes) + ")"
	        for row in run_sql(query):
	            tags.update(get_field_tags(row[0], tagtype))
	        out = list(tags)
	    return [tag for tag in out if tag]


	def get_min_last_updated(indexes):
	    """Return the min modification date for 'indexes': min(last_updated).

	    @param indexes: list of index names
	    @return: the result of run_sql(), i.e. a single row with a single
	        column holding the minimum.  For an empty list of indexes no
	        query is run and ((None,),) is returned, mirroring the shape of
	        a minimum computed over no rows.
	    """
	    names = tuple(indexes)
	    if not names:
	        # Without this guard the statement would end in "IN )", which is
	        # not valid SQL.
	        return ((None,),)
	    placeholders = ",".join(["%s"] * len(names))
	    query = """SELECT min(last_updated) FROM idxINDEX WHERE name IN (""" + placeholders + ")"
	    return run_sql(query, names)


	def remove_inexistent_indexes(indexes, leave_virtual=False):
	    """Drop the indexes that do not exist from the given list of indexes.

	    The order and any duplicates of the surviving entries are kept.

	    @param indexes: list of indexes
	    @param leave_virtual: should we leave virtual indexes in the list?
	    @return: new list with the existing indexes only
	    """
	    known = get_all_indexes(leave_virtual)
	    return [name for name in indexes if name in known]


	def get_records_range_for_index(index_id):
	    """
	    Get the range of record ids stored in the reversed word table of the
	    given index.

	    @param index_id: numeric id of the index; it is formatted with two
	        digits into the table name idxWORD<id>R
	    @return: the first row of the result, i.e. (min id, max id), or None
	        when the result is empty or anything goes wrong
	    """
	    try:
	        rows = run_sql("""SELECT min(id_bibrec), max(id_bibrec) FROM idxWORD%02dR""" % index_id)
	        return rows[0] if rows else None
	    except Exception:
	        # Best effort: bad id, missing table and any other failure all
	        # mean "no range available".
	        return None


	def make_prefix(index_name):
	    """
	    Build the prefix of a dependent index.

	    The prefix is added to every word from this index stored in the
	    reversed table of the corresponding virtual index.

	    @param index_name: name of the dependent index we want to create prefix for
	    @return: index_name wrapped in double underscores
	    """
	    wrapper = "__"
	    return wrapper + index_name + wrapper


	class UnknownTokenizer(Exception):
	    """Exception type for an unknown tokenizer; adds nothing to Exception."""


	def list_union(list1, list2):
	    """Return the union of the two lists as a new list.

	    Every distinct element of list1 and list2 appears exactly once.
	    Elements must be hashable.

	    @param list1: first list
	    @param list2: second list
	    @return: list of the distinct elements
	    """
	    union_dict = dict.fromkeys(list1)
	    union_dict.update(dict.fromkeys(list2))
	    # list() so that callers always get a real list, also where
	    # dict.keys() returns a view object.
	    return list(union_dict)


	def get_index_fields(index_id):
	    """Return the fields that are connected to the given index.

	    @param index_id: id of the index
	    @return: (id, name) rows of the fields, as returned by run_sql()
	    """
	    sql = """SELECT f.id, f.name FROM field as f,
	idxINDEX as w,
	idxINDEX_field as wf
	WHERE f.id=wf.id_field AND
	wf.id_idxINDEX=w.id AND
	w.id=%s
	"""
	    return run_sql(sql, (index_id,))


	def recognize_marc_tag(tag):
	    """Tell whether tag looks like a MARC tag.

	    A tag is recognized when it is 3 to 6 characters long and starts
	    with three digits (e.g. '100', '100__a', '100__%'), or when it
	    consists of two digits followed by '%' (e.g. '10%').

	    @param tag: the tag to check
	    @return: True or False
	    """
	    size = len(tag)
	    full_tag = 3 <= size <= 6 and tag[0:3].isdigit()
	    if full_tag:
	        return True
	    truncated_tag = size == 3 and tag[0:2].isdigit() and tag[2] == '%'
	    return bool(truncated_tag)


	def _is_collection(subfield):
	"""Checks if a type is a collection;
	get_values_recursively internal function."""
	return hasattr(subfield, '__iter__')


	def _get_values(subfield):
	"""Returns values of a subfield suitable for later tokenizing;
	get_values_recursively internal function."""
	if type(subfield) == dict:
	return subfield.values()
	else:
	return subfield


	def get_values_recursively(subfield, phrases):
	    """Collect all values suitable for later tokenizing from a
	    field/subfield of a bibfield record.

	    Collections are walked recursively (for dictionaries only the values
	    are visited); every other value that is not None is passed through
	    utf8ifier() and appended to phrases.  Nothing is returned: phrases is
	    filled in place.

	    @param subfield: value of the field/subfield to walk
	    @param phrases: container for phrases (for example empty list)

	    FIXME: move this function to bibfield!
	    As soon as possible. Note that journal tokenizer
	    also needs to be changed.
	    """
	    if not _is_collection(subfield):
	        if subfield is not None:
	            phrases.append(utf8ifier(subfield))
	        return
	    for item in _get_values(subfield):
	        get_values_recursively(item, phrases)

engine_utils.pyNo OneTemporaryActions

File Metadata

engine_utils.pyView Options

Event Timeline

engine_utils.py
No OneTemporary
Actions

engine_utils.py
View Options