File Metadata

Created: Thu, May 9, 21:32

htmlparser.py
View Options

	## $Id$
	## HTML parser for records.

	## This file is part of the CERN Document Server Software (CDSware).
	## Copyright (C) 2002 CERN.
	##
	## The CDSware is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## The CDSware is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with CDSware; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	## Read the configuration variables.
	#include "config.wml"
	#include "configbis.wml"

	<protect>## $Id$ </protect>
	<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
	"""HTML parser for records."""

	## rest of the Python code goes below

	__version__ = "$Id$"

	try:
	#from config import *
	from search_engine import print_record
	from HTMLParser import HTMLParser
	except ImportError, e:
	print "Error: %s" % e
	import sys
	sys.exit(1)


	class RecordHTMLParser(HTMLParser):
	    """Turn the HTML returned by cdsware.search_engine.print_record into text.

	    Feed the HTML with feed(); the plain-text rendering accumulates in the
	    ``result`` attribute.  The conversion rules are:

	    * ``<br>`` becomes a newline;
	    * ``<strong>`` tags are dropped, their text is kept;
	    * an ``<a>`` tag having any attribute whose value is ``note`` is
	      rendered as ``Fulltext : <link text>``;
	    * an ``<a>`` tag having any attribute whose value is ``moreinfo`` is
	      rendered as ``Detailed record : <href value>``;
	    * text that is exactly ``Detailed record`` is suppressed (the label has
	      already been written when the tag was opened);
	    * every other piece of text is copied unchanged.

	    Attributes:
	        result: the plain text produced so far.
	        printURL: 1 while the current ``<a>`` tag is a ``moreinfo`` link.
	        unclosedBracket: 1 while a ``<`` opened for a ``note`` link still
	            has to be closed by the matching ``</a>``.
	    """

	    def __init__(self):
	        # Explicit base-class call kept as in the original; the Python 2
	        # HTMLParser is an old-style class, so super() is not usable.
	        HTMLParser.__init__(self)
	        self.result = ''
	        # Initialise the link state here so that a stray </a> seen before
	        # any <a> does not raise AttributeError in handle_endtag().
	        self.printURL = 0
	        self.unclosedBracket = 0

	    def handle_starttag(self, tag, attrs):
	        """Emit the text that corresponds to an opening tag.

	        ``attrs`` is the list of (name, value) pairs supplied by HTMLParser.
	        """
	        if tag == 'br':
	            self.result += '\n'
	            return
	        if tag != 'a':
	            # <strong> and all other tags produce no output of their own.
	            return

	        self.printURL = 0
	        self.unclosedBracket = 0
	        hrefs = []
	        for name, value in attrs:
	            # The markers are matched on the attribute value only,
	            # whatever the attribute is called.
	            if value == 'note':
	                self.result += 'Fulltext : <'
	                self.unclosedBracket = 1
	            if value == 'moreinfo':
	                self.result += 'Detailed record : '
	                self.printURL = 1
	            # A valueless href is reported as None; there is no URL to show.
	            if name == 'href' and value is not None:
	                hrefs.append(value)

	        # The URL is written after all attributes have been inspected, so
	        # the output does not depend on whether href precedes or follows
	        # the 'moreinfo' marker inside the tag.
	        if self.printURL == 1:
	            for url in hrefs:
	                self.result += '<' + url + '>'

	    def handle_endtag(self, tag):
	        """Close the bracket opened for a ``note`` link, if any."""
	        if tag == 'a' and self.unclosedBracket == 1:
	            self.result += '>'
	            self.unclosedBracket = 0

	    def handle_data(self, data):
	        """Append text to the result, skipping the 'Detailed record' label."""
	        if data != 'Detailed record':
	            self.result += data

	def get_as_text(record_id):
	    """Render one record as plain text.

	    The record's HTML is obtained from print_record(record_id) and fed to
	    a fresh RecordHTMLParser; the text accumulated in the parser's
	    ``result`` attribute is returned.
	    """
	    html_source = print_record(record_id)
	    text_parser = RecordHTMLParser()
	    text_parser.feed(html_source)
	    return text_parser.result


	if __name__ == "__main__":
	    # Manual check: print record 1 as raw HTML, a separator line, and then
	    # the same record converted to plain text.  A parenthesised print with
	    # a single argument produces the same output as the print statement.
	    print(print_record(1))
	    print("***")
	    print(get_as_text(1))

htmlparser.py
No OneTemporary
Actions

File Metadata

htmlparser.py
View Options

Event Timeline

htmlparser.pyNo OneTemporaryActions

File Metadata

htmlparser.pyView Options

Event Timeline

htmlparser.py
No OneTemporary
Actions

htmlparser.py
View Options