bom_textdoc.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Thu, Jun 20, 19:55

bom_textdoc.py
View Options

	## This file is part of Invenio.
	## Copyright (C) 2007, 2008, 2009, 2010, 2011 CERN.
	##
	## Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.


	"""BibObject Module providing BibObject prividing features for documents containing text (not necessarily as the main part of the content)"""


	from invenio.bibdocfile import BibDoc, InvenioBibDocFileError
	from invenio.dbquery import run_sql
	from datetime import datetime
	from invenio.ext.logging import register_exception
	import os

	class BibTextDoc(BibDoc):
	    """
	    BibDoc specialisation for documents containing text.

	    The text extracted for a given version is cached in a file named
	    C{.text;<version>} inside C{self.basedir}; every method of this class
	    reads or writes that cache file.
	    """

	    def _text_cache_path(self, version):
	        """
	        @param version: the version of the document.
	        @type version: integer
	        @return: the full path of the file caching the text of C{version}
	            (the file does not necessarily exist).
	        @rtype: string
	        """
	        return os.path.join(self.basedir, '.text;%i' % version)

	    def get_text(self, version=None):
	        """
	        @param version: the requested version. If not set, the latest version
	            will be used.
	        @type version: integer
	        @return: the textual content corresponding to the specified version
	            of the document, or an empty string if no text has been
	            extracted for that version.
	        @rtype: string
	        """
	        if version is None:
	            version = self.get_latest_version()
	        # NOTE: version must be passed by keyword, since the first positional
	        # parameter of has_text() is require_up_to_date.
	        if not self.has_text(version=version):
	            return ""
	        with open(self._text_cache_path(version)) as text_file:
	            return text_file.read()

	    def get_text_path(self, version=None):
	        """
	        @param version: the requested version. If not set, the latest version
	            will be used.
	        @type version: int
	        @return: the full path to the textual content corresponding to the
	            specified version of the document, or an empty string if no
	            text has been extracted for that version.
	        @rtype: string
	        """
	        if version is None:
	            version = self.get_latest_version()
	        # NOTE: version must be passed by keyword, since the first positional
	        # parameter of has_text() is require_up_to_date.
	        if not self.has_text(version=version):
	            return ""
	        return self._text_cache_path(version)

	    def extract_text(self, version=None, perform_ocr=False, ln='en'):
	        """
	        Try what is necessary to extract the textual information of a document.

	        @param version: the version of the document for which text is required.
	            If not specified the text will be retrieved from the last version.
	        @type version: integer
	        @param perform_ocr: whether to perform OCR.
	        @type perform_ocr: bool
	        @param ln: a two letter language code to give as a hint to the OCR
	            procedure.
	        @type ln: string
	        @raise InvenioBibDocFileError: in case of error.
	        @note: the text is extracted and cached for later use. Use L{get_text}
	            to retrieve it.
	        """
	        # Imported locally, as in the original code (presumably to avoid a
	        # circular import at module load time -- TODO confirm).
	        from invenio.websubmit_file_converter import (
	            get_best_format_to_extract_text_from,
	            convert_file,
	            InvenioWebSubmitFileConverterError)
	        if version is None:
	            version = self.get_latest_version()
	        text_path = self._text_cache_path(version)
	        docfiles = self.list_version_files(version)
	        ## We try to extract text only from original or OCRed documents.
	        filenames = [docfile.get_full_path() for docfile in docfiles
	                     if 'CONVERTED' not in docfile.flags or 'OCRED' in docfile.flags]
	        try:
	            filename = get_best_format_to_extract_text_from(filenames)
	        except InvenioWebSubmitFileConverterError:
	            ## We fall back on considering all the documents
	            filenames = [docfile.get_full_path() for docfile in docfiles]
	            try:
	                filename = get_best_format_to_extract_text_from(filenames)
	            except InvenioWebSubmitFileConverterError:
	                ## No usable format at all: cache an empty text, so that
	                ## has_text() finds a cache file for this version.
	                with open(text_path, 'w') as text_file:
	                    text_file.write('')
	                return
	        try:
	            convert_file(filename, text_path, '.txt', perform_ocr=perform_ocr, ln=ln)
	            if version == self.get_latest_version():
	                run_sql("UPDATE bibdoc SET text_extraction_date=NOW() WHERE id=%s", (self.id, ))
	        except InvenioWebSubmitFileConverterError as e:
	            ## Must be called from within the except block.
	            register_exception(alert_admin=True, prefix="Error in extracting text from bibdoc %i, version %i" % (self.id, version))
	            raise InvenioBibDocFileError(str(e))

	    def pdf_a_p(self):
	        """
	        @return: True if this document contains a PDF in PDF/A format.
	        @rtype: bool
	        """
	        return self.has_flag('PDF/A', 'pdf')

	    def has_text(self, require_up_to_date=False, version=None):
	        """
	        Return True if the text of this document has already been extracted.

	        @param require_up_to_date: if True check the text was actually
	            extracted after the most recent format of the given version.
	        @type require_up_to_date: bool
	        @param version: a version for which the text should have been
	            extracted. If not specified the latest version is considered.
	        @type version: integer
	        @return: True if the text has already been extracted.
	        @rtype: bool
	        @note: C{require_up_to_date} is the first positional parameter;
	            always pass C{version} by keyword.
	        """
	        if version is None:
	            version = self.get_latest_version()
	        text_path = self._text_cache_path(version)
	        if not os.path.exists(text_path):
	            return False
	        if not require_up_to_date:
	            return True
	        ## The cached text is stale as soon as one file of this version
	        ## has a modification date equal to or newer than the cache file.
	        text_md = datetime.fromtimestamp(os.path.getmtime(text_path))
	        return not any(text_md <= docfile.md
	                       for docfile in self.list_version_files(version))

	    def __repr__(self):
	        """
	        @return: a representation built from the id, doctype and
	            human_readable attributes.
	        @rtype: string
	        """
	        return 'BibTextDoc(%s, %s, %s)' % (repr(self.id), repr(self.doctype), repr(self.human_readable))

	def supports(doctype, extensions):
	    """
	    Tell whether a document with the given doctype and file extensions
	    matches the criteria checked by this module.

	    @param doctype: the doctype of the document.
	    @type doctype: string
	    @param extensions: the file extensions of the document (each one is
	        expected to start with a dot, e.g. ".pdf" or ".ps.gz").
	    @type extensions: iterable of strings
	    @return: True if C{doctype} is "Fulltext" or if at least one extension
	        starts with ".pdf" or ".ps"; False otherwise.
	    @rtype: bool
	    """
	    if doctype == "Fulltext":
	        return True
	    ## any() replaces the former reduce()-based fold: same result, but it
	    ## does not depend on the Python 2-only reduce builtin.
	    return any(extension.startswith((".pdf", ".ps")) for extension in extensions)

	def create_instance(docid=None, doctype='Main', human_readable=False,  # pylint: disable=W0613
	                    initial_data=None):
	    """
	    Build a new L{BibTextDoc}.

	    @param docid: forwarded to the BibTextDoc constructor.
	    @param doctype: accepted but not used by this function (hence the
	        pylint W0613 suppression on the signature).
	    @param human_readable: forwarded to the BibTextDoc constructor.
	    @param initial_data: forwarded to the BibTextDoc constructor.
	    @return: the newly created instance.
	    @rtype: BibTextDoc
	    """
	    constructor_arguments = {
	        'docid': docid,
	        'human_readable': human_readable,
	        'initial_data': initial_data,
	    }
	    return BibTextDoc(**constructor_arguments)

bom_textdoc.pyNo OneTemporaryActions

File Metadata

bom_textdoc.pyView Options

Event Timeline

bom_textdoc.py
No OneTemporary
Actions

bom_textdoc.py
View Options