BibIndexFulltextTokenizer.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, Jul 2, 09:08

BibIndexFulltextTokenizer.py
View Options

	# -- coding:utf-8 --
	##
	## This file is part of Invenio.
	## Copyright (C) 2010, 2011, 2012 CERN.
	##
	## Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
	"""BibIndexFulltextTokenizer: extracts words form a given document.
	Document is given by its URL.
	"""

	import os
	import sys
	import logging
	import urllib2
	import re


	from invenio.config import \
	CFG_SOLR_URL, \
	CFG_XAPIAN_ENABLED, \
	CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY, \
	CFG_BIBINDEX_SPLASH_PAGES
	from invenio.htmlutils import get_links_in_html_page
	from invenio.websubmit_file_converter import convert_file, get_file_converter_logger
	from invenio.solrutils_bibindex_indexer import solr_add_fulltext
	from invenio.xapianutils_bibindex_indexer import xapian_add
	from invenio.bibdocfile import bibdocfile_url_p, \
	bibdocfile_url_to_bibdoc, download_url, \
	BibRecDocs
	from invenio.bibindexadminlib import get_idx_indexer
	from invenio.bibtask import write_message
	from invenio.errorlib import register_exception
	from invenio.intbitset import intbitset
	from invenio.bibindex_tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer


	fulltext_added = intbitset() # stores ids of records whose fulltexts have been added




	class BibIndexFulltextTokenizer(BibIndexDefaultTokenizer):
	"""
	Exctracts all the words contained in document specified by url.
	"""

	def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False):
	self.verbose = 3
	BibIndexDefaultTokenizer.__init__(self, stemming_language,
	remove_stopwords,
	remove_html_markup,
	remove_latex_markup)

	def set_verbose(self, verbose):
	"""Allows to change verbosity level during indexing"""
	self.verbose = verbose

	def tokenize_for_words_default(self, phrase):
	"""Default tokenize_for_words inherited from default tokenizer"""
	return super(BibIndexFulltextTokenizer, self).tokenize_for_words(phrase)


	def get_words_from_fulltext(self, url_direct_or_indirect):
	"""Returns all the words contained in the document specified by
	URL_DIRECT_OR_INDIRECT with the words being split by various
	SRE_SEPARATORS regexp set earlier. If FORCE_FILE_EXTENSION is
	set (e.g. to "pdf", then treat URL_DIRECT_OR_INDIRECT as a PDF
	file. (This is interesting to index Indico for example.) Note
	also that URL_DIRECT_OR_INDIRECT may be either a direct URL to
	the fulltext file or an URL to a setlink-like page body that
	presents the links to be indexed. In the latter case the
	URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs
	to fulltext documents, for all knows file extensions as
	specified by global CONV_PROGRAMS config variable.
	"""
	write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2)
	try:
	if bibdocfile_url_p(url_direct_or_indirect):
	write_message("... %s is an internal document" % url_direct_or_indirect, verbose=2)
	bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
	indexer = get_idx_indexer('fulltext')
	if indexer != 'native':
	# A document might belong to multiple records
	for rec_link in bibdoc.bibrec_links:
	recid = rec_link["recid"]
	# Adds fulltexts of all files once per records
	if not recid in fulltext_added:
	bibrecdocs = BibRecDocs(recid)
	text = bibrecdocs.get_text()
	if indexer == 'SOLR' and CFG_SOLR_URL:
	solr_add_fulltext(recid, text)
	elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
	xapian_add(recid, 'fulltext', text)

	fulltext_added.add(recid)
	# we are relying on an external information retrieval system
	# to provide full-text indexing, so dispatch text to it and
	# return nothing here:
	return []
	else:
	text = ""
	if hasattr(bibdoc, "get_text"):
	text = bibdoc.get_text()
	return self.tokenize_for_words_default(text)
	else:
	if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
	write_message("... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2)
	return []
	write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2)
	urls_to_index = set()
	for splash_re, url_re in CFG_BIBINDEX_SPLASH_PAGES.iteritems():
	if re.match(splash_re, url_direct_or_indirect):
	write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2)
	html = urllib2.urlopen(url_direct_or_indirect).read()
	urls = get_links_in_html_page(html)
	write_message("... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3)
	for url in urls:
	if re.match(url_re, url):
	write_message("... will index %s (matched by %s)" % (url, url_re), verbose=2)
	urls_to_index.add(url)
	if not urls_to_index:
	urls_to_index.add(url_direct_or_indirect)
	write_message("... will extract words from %s" % ', '.join(urls_to_index), verbose=2)
	words = {}
	for url in urls_to_index:
	tmpdoc = download_url(url)
	file_converter_logger = get_file_converter_logger()
	old_logging_level = file_converter_logger.getEffectiveLevel()
	if self.verbose > 3:
	file_converter_logger.setLevel(logging.DEBUG)
	try:
	try:
	tmptext = convert_file(tmpdoc, output_format='.txt')
	text = open(tmptext).read()
	os.remove(tmptext)

	indexer = get_idx_indexer('fulltext')
	if indexer != 'native':
	if indexer == 'SOLR' and CFG_SOLR_URL:
	solr_add_fulltext(None, text) # FIXME: use real record ID
	if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
	#xapian_add(None, 'fulltext', text) # FIXME: use real record ID
	pass
	# we are relying on an external information retrieval system
	# to provide full-text indexing, so dispatch text to it and
	# return nothing here:
	tmpwords = []
	else:
	tmpwords = self.tokenize_for_words_default(text)
	words.update(dict(map(lambda x: (x, 1), tmpwords)))
	except Exception, e:
	message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e)
	register_exception(prefix=message, alert_admin=True)
	write_message(message, stream=sys.stderr)
	finally:
	os.remove(tmpdoc)
	if self.verbose > 3:
	file_converter_logger.setLevel(old_logging_level)
	return words.keys()
	except Exception, e:
	message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e)
	register_exception(prefix=message, alert_admin=True)
	write_message(message, stream=sys.stderr)
	return []


	def tokenize_for_words(self, phrase):
	return self.get_words_from_fulltext(phrase)

BibIndexFulltextTokenizer.pyNo OneTemporaryActions

File Metadata

BibIndexFulltextTokenizer.pyView Options

Event Timeline

BibIndexFulltextTokenizer.py
No OneTemporary
Actions

BibIndexFulltextTokenizer.py
View Options