# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2014 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BibIndex termcollectors.
"""

import fnmatch

from invenio.legacy.bibindex.engine_utils import list_union, \
    UnknownTokenizer, \
    get_values_recursively
from invenio.legacy.bibindex.engine_config import CFG_BIBINDEX_TOKENIZER_TYPE
from invenio.legacy.dbquery import run_sql
from invenio.legacy.bibdocfile.api import BibRecDocs
from invenio.legacy.search_engine.utils import get_fieldvalues
from invenio.legacy.bibauthority.engine import get_index_strings_by_control_no
from invenio.legacy.bibauthority.config import \
    CFG_BIBAUTHORITY_CONTROLLED_FIELDS_BIBLIOGRAPHIC
from invenio.modules.records.api import get_record


class TermCollector(object):
    """
    Objects of this class take care of collecting phrases from
    record metadata and tokenizing them in order to build
    term lists which can be stored in the database.
    They collect terms from MARC tags ONLY.
    Do not use this class for other standards: it will not work
    for them. Use NonmarcTermCollector instead.
    """

    def __init__(self, tokenizer,
                 tokenizer_type,
                 table_type,
                 tags,
                 recIDs_range):
        self.table_type = table_type
        self.tokenizer = tokenizer
        self.tokenizer_type = tokenizer_type
        self.tokenizing_function = \
            self.tokenizer.get_tokenizing_function(table_type)
        self.tags = tags
        self.special_tags = {}
        self.first_recID = recIDs_range[0]
        self.last_recID = recIDs_range[1]
        if self.tokenizer_type not in CFG_BIBINDEX_TOKENIZER_TYPE.values():
            raise UnknownTokenizer("Tokenizer has not been recognized: %s"
                                   % self.tokenizer.__class__.__name__)

    def set_special_tags(self, special_tags):
        """
        Adds special tags for further use.
        """
        self.special_tags = special_tags

    def collect(self, recIDs, termslist=None):
        """
        Finds terms and tokenizes them in order to obtain a terms list.
        @param recIDs: records to index in the form: [rid1, rid2, rid3 ...]
        @param termslist: dictionary for results
        """
        # Use None instead of a mutable {} default: a shared dict default
        # would persist between calls and accumulate stale results.
        if termslist is None:
            termslist = {}
        # Dispatch to _collect_recjson, _collect_multifield or
        # _collect_string, depending on the tokenizer type.
        collector = getattr(self, "_collect_" + self.tokenizer_type)
        return collector(recIDs, termslist)
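
    # Minimal usage sketch (illustrative: the tokenizer object, the
    # 'Words' table type and the tag value are assumptions, not defined
    # in this file; 'string' is one of the tokenizer types dispatched
    # above):
    #
    #   collector = TermCollector(some_tokenizer, 'string', 'Words',
    #                             ['245__a'], (1, 100))
    #   terms = collector.collect([1, 2, 3])
    #   # terms maps each recID to its list of tokens,
    #   # e.g. {1: ['higgs', 'boson'], 2: [...], 3: [...]}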

    def _collect_recjson(self, recIDs, termslist):
        """
        Collects terms from recjson records using bibfield.
        Used together with the recjson tokenizer.
        """
        tokenizing_function = self.tokenizing_function
        for recID in recIDs:
            record = get_record(recID)
            if record:
                new_words = tokenizing_function(record)
                if recID not in termslist:
                    termslist[recID] = []
                termslist[recID] = list_union(new_words, termslist[recID])
        return termslist
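
    # Note: list_union merges the newly tokenized terms with any terms
    # already collected for this record, so repeated collect() calls on
    # the same termslist accumulate results rather than overwrite them.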

    def _collect_multifield(self, recIDs, termslist):
        """
        Calculates terms from many fields or tags.
        Used together with the multifield tokenizer.
        """
        tokenizing_function = self.tokenizing_function
        for recID in recIDs:
            new_words = tokenizing_function(recID)
            if recID not in termslist:
                termslist[recID] = []
            termslist[recID] = list_union(new_words, termslist[recID])
        return termslist

    def _collect_string(self, recIDs, termslist):
        """
        Collects terms from specific tags or fields.
        Used together with the string tokenizer.
        """
        for tag in self.tags:
            tokenizing_function = self.special_tags.get(
                tag, self.tokenizing_function)
            phrases = self._get_phrases_for_tokenizing(tag, recIDs)
            for row in phrases:
                recID, phrase = row
                if recID in recIDs:
                    if recID not in termslist:
                        termslist[recID] = []
                    new_words = tokenizing_function(phrase)
                    termslist[recID] = list_union(new_words, termslist[recID])
        return termslist
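
    # Sketch of the special-tags override used above (illustrative: the
    # fulltext tokenizing function named below is an assumption, not
    # defined in this file; '8564_u' is the URL tag handled specially in
    # _get_phrases_for_tokenizing):
    #
    #   collector.set_special_tags(
    #       {'8564_u': fulltext_tokenizer.tokenizing_function})
    #
    # With this in place, phrases found under tag 8564_u are tokenized by
    # the overriding function instead of the collector's default one.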

    def _get_phrases_for_tokenizing(self, tag, recIDs):
        """
        Gets phrases for later tokenization
        for a range of records and a specific tag.
        @param tag: MARC tag
        @param recIDs: list of specific recIDs (not a range)
        """
        if len(recIDs) == 0:
            return ()
        # The table name is derived from the first two digits of the tag,
        # e.g. tag '245__a' is stored in bib24x/bibrec_bib24x.
        bibXXx = "bib" + tag[0] + tag[1] + "x"
        bibrec_bibXXx = "bibrec_" + bibXXx
        query = """SELECT bb.id_bibrec,b.value FROM %s AS b, %s AS bb
                   WHERE bb.id_bibrec BETWEEN %%s AND %%s
                   AND bb.id_bibxxx=b.id AND tag LIKE %%s""" \
                % (bibXXx, bibrec_bibXXx)
        phrases = run_sql(query, (self.first_recID, self.last_recID, tag))
        if tag == '8564_u':
            ## FIXME: Quick hack to be sure that hidden files are
            ## actually indexed.
            phrases = set(phrases)
            for recID in recIDs:
                for bibdocfile in BibRecDocs(recID).list_latest_files():
                    phrases.add((recID, bibdocfile.get_url()))
        # Authority records: translate the SQL LIKE wildcard '%' into the
        # fnmatch wildcard '*' to find controlled bibliographic fields
        # matching this tag.
        pattern = tag.replace('%', '*')
        matches = fnmatch.filter(
            CFG_BIBAUTHORITY_CONTROLLED_FIELDS_BIBLIOGRAPHIC.keys(), pattern)
        if not len(matches):
            return phrases
        phrases = set(phrases)
        for tag_match in matches:
            # The control number lives in subfield 0 of the matched field,
            # e.g. '100__a' -> authority tag '100__0'.
            authority_tag = tag_match[0:3] + "__0"
            for recID in recIDs:
                control_nos = get_fieldvalues(recID, authority_tag)
                for control_no in control_nos:
                    new_strings = get_index_strings_by_control_no(control_no)
                    for string_value in new_strings:
                        phrases.add((recID, string_value))
        return phrases


class NonmarcTermCollector(TermCollector):
    """
    TermCollector for standards other than MARC.
    Uses bibfield's records and fields.
    """

    def __init__(self, tokenizer,
                 tokenizer_type,
                 table_type,
                 tags,
                 recIDs_range):
        super(NonmarcTermCollector, self).__init__(tokenizer,
                                                   tokenizer_type,
                                                   table_type,
                                                   tags,
                                                   recIDs_range)
        self.tokenizing_function = \
            self.tokenizer.get_nonmarc_tokenizing_function(table_type)

    def _collect_string(self, recIDs, termslist):
        """
        Collects terms from specific tags or fields.
        Used together with the string tokenizer.
        """
        tags = self.tags
        for recID in recIDs:
            rec = get_record(recID)
            new_words = []
            extend = new_words.extend
            for tag in tags:
                tokenizing_function = self.special_tags.get(
                    tag, self.tokenizing_function)
                phrases = []
                recjson_field = rec.get(tag)
                get_values_recursively(recjson_field, phrases)
                for phrase in phrases:
                    extend(tokenizing_function(phrase))
            if new_words:
                if recID not in termslist:
                    termslist[recID] = []
                termslist[recID] = list_union(new_words, termslist[recID])
        return termslist
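

# Minimal usage sketch for the non-MARC collector (illustrative: the
# tokenizer object and the recjson field names are assumptions, not
# defined in this file):
#
#   collector = NonmarcTermCollector(some_tokenizer, 'string', 'Words',
#                                    ['title', 'abstract'], (1, 100))
#   terms = collector.collect([1, 2])
#
# Unlike TermCollector, phrases are read from bibfield/recjson fields
# via rec.get(tag) rather than from the bibXXx MARC tables.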
