text_extractor.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Mon, Jun 24, 16:28

text_extractor.py
View Options

	# -- coding: utf-8 --
	##
	## This file is part of Invenio.
	## Copyright (C) 2008, 2009, 2010, 2011 CERN.
	##
	## Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	"""
	BibClassify text extractor.

	This module provides method to extract the fulltext from local or remote
	documents. Currently 2 formats of documents are supported: PDF and text
	documents.

	2 methods provide the functionality of the module: text_lines_from_local_file
	and text_lines_from_url.

	This module also provides the utility 'is_pdf' that uses GNU file in order to
	determine if a local file is a PDF file.

	This module is STANDALONE safe
	"""

	import os
	import re
	import sys
	import tempfile
	import urllib2
	from invenio import bibclassify_config as bconfig

	if bconfig.STANDALONE:
	from urllib2 import urlopen
	else:
	from invenio.urlutils import make_invenio_opener
	urlopen = make_invenio_opener('BibClassify').open

	log = bconfig.get_logger("bibclassify.text_extractor")


	_ONE_WORD = re.compile("[A-Za-z]{2,}")

	def text_lines_from_local_file(document, remote=False):
	"""Returns the fulltext of the local file.
	@var document: fullpath to the file that should be read
	@var remote: boolean, if True does not count lines (gosh!)
	@return: list of lines if st was read or an empty list"""

	# FIXME - this does not care if we open anything, including binary files

	try:
	if is_pdf(document):
	if not executable_exists("pdftotext"):
	log.error("pdftotext is not available on the system.")
	cmd = "pdftotext -q -enc UTF-8 %s -" % re.escape(document)
	filestream = os.popen(cmd)
	else:
	filestream = open(document, "r")
	except IOError, ex1:
	log.error("Unable to read from file %s. (%s)" % (document, ex1.strerror))
	return []

	# FIXME - we assume it is utf-8 encoded / that is not good
	lines = [line.decode("utf-8", 'replace') for line in filestream]
	filestream.close()

	if not _is_english_text('\n'.join(lines)):
	log.warning("It seems the file '%s' is unvalid and doesn't "
	"contain text. Please communicate this file to the Invenio "
	"team." % document)

	line_nb = len(lines)
	word_nb = 0
	for line in lines:
	word_nb += len(re.findall("\S+", line))

	# Discard lines that do not contain at least one word.
	lines = [line for line in lines if _ONE_WORD.search(line) is not None]

	if not remote:
	log.info("Local file has %d lines and %d words." % (line_nb, word_nb))

	return lines

	def _is_english_text(text):
	"""
	Checks if a text is correct english.
	Computes the number of words in the text and compares it to the
	expected number of words (based on an average size of words of 5.1
	letters).

	@param text_lines: the text to analyze
	@type text_lines: string
	@return: True if the text is English, False otherwise
	@rtype: Boolean
	"""
	# Consider one word and one space.
	avg_word_length = 5.1 + 1
	expected_word_number = float(len(text)) / avg_word_length

	words = [word
	for word in re.split('\W', text)
	if word.isalpha()]

	word_number = len(words)

	return word_number > .5 * expected_word_number

	def text_lines_from_url(url, user_agent=""):
	"""Returns the fulltext of the file found at the URL."""
	request = urllib2.Request(url)
	if user_agent:
	request.add_header("User-Agent", user_agent)
	try:
	distant_stream = urlopen(request)
	# Write the URL content to a temporary file.
	local_file = tempfile.mkstemp(prefix="bibclassify.")[1]
	local_stream = open(local_file, "w")
	local_stream.write(distant_stream.read())
	local_stream.close()
	except:
	log.error("Unable to read from URL %s." % url)
	return None
	else:
	# Read lines from the temporary file.
	lines = text_lines_from_local_file(local_file, remote=True)
	os.remove(local_file)

	line_nb = len(lines)
	word_nb = 0
	for line in lines:
	word_nb += len(re.findall("\S+", line))

	log.info("Remote file has %d lines and %d words." % (line_nb, word_nb))

	return lines

	def executable_exists(executable):
	"""Tests if an executable is available on the system."""
	for directory in os.getenv("PATH").split(":"):
	if os.path.exists(os.path.join(directory, executable)):
	return True
	return False

	def is_pdf(document):
	"""Checks if a document is a PDF file. Returns True if is is."""
	if not executable_exists('pdftotext'):
	log.warning("GNU file was not found on the system. "
	"Switching to a weak file extension test.")
	if document.lower().endswith(".pdf"):
	return True
	return False
	# Tested with file version >= 4.10. First test is secure and works
	# with file version 4.25. Second condition is tested for file
	# version 4.10.
	file_output = os.popen('file ' + re.escape(document)).read()
	try:
	filetype = file_output.split(":")[1]
	except IndexError:
	log.error("Your version of the 'file' utility seems to "
	"be unsupported. Please report this to cds.support@cern.ch.")
	raise Exception('Incompatible pdftotext')

	pdf = filetype.find("PDF") > -1
	# This is how it should be done however this is incompatible with
	# file version 4.10.
	#os.popen('file -bi ' + document).read().find("application/pdf")
	return pdf

text_extractor.pyNo OneTemporaryActions

File Metadata

text_extractor.pyView Options

Event Timeline

text_extractor.py
No OneTemporary
Actions

text_extractor.py
View Options