File Metadata

Created: Sun, Jun 30, 17:13

htmlparser.py
View Options

	## This file is part of CDS Invenio.
	## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
	##
	## CDS Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## CDS Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	"""HTML parser for records."""

	__revision__ = "$Id$"

	import re
	from HTMLParser import HTMLParser
	from string import split
	import textwrap
	import htmlentitydefs

	from invenio.config import \
	CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL, \
	CFG_SITE_LANG
	from invenio.bibformat import format_record
	from invenio.bibindex_engine import re_html
	from invenio.messages import gettext_set_language

	def wrap(text):
	    """Re-flow text into lines of bounded length.

	    The whole input is handed to textwrap.wrap() as a single paragraph,
	    using CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL as the
	    width, so line breaks present in the input are not kept.

	    Returns the wrapped lines concatenated, each one terminated by a
	    newline character (an empty string when there are no lines).
	    """
	    width = CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL
	    return ''.join([piece + '\n' for piece in textwrap.wrap(text, width)])

	def wrap_records(text):
	    """Wrap each line of the given text separately.

	    The input is cut at every newline and each piece is wrapped on its
	    own to at most CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL
	    characters, so existing line breaks stay where they were.

	    Note that textwrap.wrap() returns no lines at all for an empty
	    piece, hence empty input lines do not show up in the output.

	    Returns the wrapped lines concatenated, each one terminated by a
	    newline character.
	    """
	    width = CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL
	    wrapped = []
	    for source_line in text.split('\n'):
	        wrapped.extend(textwrap.wrap(source_line, width))
	    return ''.join([piece + '\n' for piece in wrapped])

	class RecordHTMLParser(HTMLParser):
	    """A parser for the HTML returned by invenio.search_engine.print_record.

	    The parser provides methods to transform the HTML returned by
	    invenio.search_engine.print_record into plain text, with some
	    minor formatting.

	    Feed the HTML with feed()/close(); the accumulated plain text is then
	    available in the `result` attribute.
	    """

	    # Text nodes that are dropped from the output (compared lower-cased).
	    _IGNORED_LABELS = ('detailed record', 'similar record', 'cited by')

	    # While True, text is not copied to `result`.  Switched on by
	    # <style>/<script> start tags and START_NOT_FOR_TEXT comments,
	    # switched off by the matching end tag / END_NOT_FOR_TEXT comment.
	    silent = False

	    def __init__(self):
	        HTMLParser.__init__(self)
	        # Plain text accumulated so far.
	        self.result = ''
	        # Link state.  Initialised here (and not only when an <a> start
	        # tag is seen) so that a stray </a> cannot raise AttributeError.
	        self.printURL = 0
	        self.unclosedBracket = 0

	    def handle_starttag(self, tag, attrs):
	        """Emit the textual counterpart of a start tag.

	        `attrs` is the list of (name, value) pairs supplied by HTMLParser.
	        """
	        if tag == 'a':
	            self.printURL = 0
	            self.unclosedBracket = 0
	            # Inspect all attribute values up front so that the outcome
	            # does not depend on the order the attributes were written in.
	            values = [value for (_name, value) in attrs]
	            if 'note' in values:
	                # The link text follows as data; handle_endtag() closes
	                # the bracket.
	                self.result += 'Fulltext : <'
	                self.unclosedBracket = 1
	            if 'moreinfo' in values:
	                self.result += 'Detailed record : '
	                self.printURL = 1
	                for name, value in attrs:
	                    # A valueless href is reported as None: skip it.
	                    if name == 'href' and value is not None:
	                        self.result += '<' + value + '>'
	        elif tag == 'br':
	            self.result += '\n'
	        elif tag in ('style', 'script'):
	            self.silent = True

	    def handle_endtag(self, tag):
	        """Close a pending 'Fulltext' bracket or leave a silent region."""
	        if tag == 'a':
	            if self.unclosedBracket == 1:
	                self.result += '>'
	                self.unclosedBracket = 0
	        elif tag in ('style', 'script'):
	            self.silent = False

	    def handle_data(self, data):
	        """Append text to the result unless it is ignored or silenced."""
	        if data.lower() in self._IGNORED_LABELS:
	            return
	        if not self.silent:
	            self.result += data

	    def handle_comment(self, data):
	        """Toggle silent mode on START_NOT_FOR_TEXT / END_NOT_FOR_TEXT."""
	        marker = data.upper().strip()
	        if marker == 'START_NOT_FOR_TEXT':
	            self.silent = True
	        elif marker == 'END_NOT_FOR_TEXT':
	            self.silent = False

	    def handle_charref(self, name):
	        """Process character references of the form "&#ref;".

	        Both decimal ("&#65;") and hexadecimal ("&#x41;") references are
	        transformed to text whenever possible; malformed ones are dropped.
	        """
	        try:
	            if name[:1] in ('x', 'X'):
	                codepoint = int(name[1:], 16)
	            else:
	                codepoint = int(name)
	        except ValueError:
	            return
	        self._append_codepoint(codepoint)

	    def handle_entityref(self, name):
	        """Process a general entity reference of the form "&name;".

	        Transform to text whenever possible; unknown names are dropped.
	        """
	        codepoint = htmlentitydefs.name2codepoint.get(name)
	        if codepoint is not None:
	            self._append_codepoint(codepoint)

	    def _append_codepoint(self, codepoint):
	        """Append the UTF-8 encoding of a code point to the result.

	        Nothing is appended in silent mode, nor when the code point cannot
	        be converted or combined with the text collected so far.
	        """
	        if self.silent:
	            return
	        try:
	            self.result += unichr(codepoint).encode("utf-8")
	        except (ValueError, OverflowError):
	            # Out-of-range code point, or (UnicodeError is a ValueError)
	            # an encoding clash with the existing result: skip it.
	            return

	def get_as_text(record_id, ln=CFG_SITE_LANG):
	    """Return the record in a textual format.

	    The record is formatted with format_record(record_id, of="hb"), its
	    newlines are replaced by spaces, and the markup is converted to text
	    with RecordHTMLParser.  When the parser fails, the markup is instead
	    blanked out with the re_html pattern.  Finally the labels
	    "Detailed record", "Similar records" and "Cited by", as translated
	    for language `ln`, are removed together with an adjacent '-' or ':'.

	    Parameters:
	        record_id - identifier passed on to format_record()
	        ln        - language code used to translate the labels to remove

	    Returns the resulting text, stripped of surrounding whitespace.
	    """
	    _ = gettext_set_language(ln)
	    rec_in_hb = format_record(record_id, of="hb")
	    rec_in_hb = rec_in_hb.replace('\n', ' ')
	    htparser = RecordHTMLParser()
	    try:
	        htparser.feed(rec_in_hb)
	        htparser.close()
	        out = htparser.result
	    except Exception:
	        # Best effort: on any parsing problem fall back to stripping the
	        # markup with a regular expression.  Exception (rather than a
	        # bare except) lets KeyboardInterrupt/SystemExit propagate.
	        out = re_html.sub(' ', rec_in_hb)
	    # NOTE(review): the clean-up below is applied to both the parsed and
	    # the fallback text -- confirm against the original indentation.
	    for label in (_("Detailed record"), _("Similar records"), _("Cited by")):
	        # Translations are arbitrary text: escape them so that any regex
	        # metacharacter they contain is matched literally.
	        out = re.sub(r"[\-:]?\s%s\s[\-:]?" % re.escape(label), "", out)
	    return out.strip()

htmlparser.py
No OneTemporary
Actions

File Metadata

htmlparser.py
View Options

Event Timeline

htmlparser.pyNo OneTemporaryActions

File Metadata

htmlparser.pyView Options

Event Timeline

htmlparser.py
No OneTemporary
Actions

htmlparser.py
View Options