Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F90796946
elmsubmit_html2txt.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Nov 4, 20:45
Size
5 KB
Mime Type
text/x-python
Expires
Wed, Nov 6, 20:45 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
22116919
Attached To
R3600 invenio-infoscience
elmsubmit_html2txt.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
__revision__
=
"$Id$"
import
StringIO
import
formatter
import
htmllib
import
sgmllib
import
os
from
invenio.elmsubmit_misc
import
write_to_and_return_tempfile_name
as
_write_to_and_return_tempfile_name
from
invenio.elmsubmit_misc
import
remove_tempfile
as
_remove_tempfile
from
invenio.elmsubmit_misc
import
mapmany
as
_mapmany
# Search down to ###!!! See here !!!### for editable stuff.
# Parser classes:
class
UnicodeHTMLParser
(
htmllib
.
HTMLParser
):
def
unknown_charref
(
self
,
ref
):
# Take the HTML character reference and convert it to unicode.
try
:
self
.
handle_data
(
unichr
(
int
(
ref
)))
except
(
OverflowError
,
ValueError
):
raise
HTMLParsingFailed
# myhtmlentitydefs.py should be found in the dir with this file:
from
myhtmlentitydefs
import
entitydefs
class
NativeParser
:
# NativeParser doesn't really need to be wrapped in a class, but
# we need to provide the same parser_instance.parse() interface as
# used for command line parsers.
def
parse
(
self
,
html
,
cols
):
file
=
StringIO
.
StringIO
(
u''
)
# Create HTML parser:
writer
=
formatter
.
DumbWriter
(
file
,
maxcol
=
cols
)
myformatter
=
formatter
.
AbstractFormatter
(
writer
)
p
=
UnicodeHTMLParser
(
myformatter
)
try
:
p
.
feed
(
html
)
except
sgmllib
.
SGMLParseError
:
raise
HTMLParsingFailed
p
.
close
()
return
file
.
getvalue
()
class
CLParser
:
# Provide a generic interface to command line parsers.
# We could have saved some work by avoiding writing html to a temp
# file for those command line parsers which allow input of html
# documents on stdin. However, not all of them do and a uniform
# interface was simplest.
def
__init__
(
self
,
commandline_list
):
self
.
commandline_list
=
commandline_list
def
parse
(
self
,
html
,
cols
):
if
not
isinstance
(
html
,
unicode
):
raise
UnicodeInputRequired
utf8html
=
html
.
encode
(
'utf8'
)
tf_name
=
_write_to_and_return_tempfile_name
(
utf8html
)
# Replace cols marker:
f
=
lambda
x
:
((
x
==
[
'cols'
])
and
str
(
cols
))
or
x
# Replace filename marker:
g
=
lambda
x
:
((
x
==
[
'filename'
])
and
tf_name
)
or
x
commandline_list
=
_mapmany
([
f
,
g
],
self
.
commandline_list
)
commandline
=
''
.
join
(
commandline_list
)
# Run the process using popen3; possibly dodgy on Windows!
# Need popen3 rather other popen function because we want to
# grab stderr and hide it from the clients console.
(
stdin
,
stdout
,
stderr
)
=
os
.
popen3
(
commandline
,
'r'
)
utf8output
=
stdout
.
read
()
exit_status
=
stdout
.
close
()
_remove_tempfile
(
tf_name
)
# Just in case the parser outputs bogus utf8:
# Check the return code:
if
exit_status
is
not
None
:
raise
HTMLParsingFailed
# Convert back to unicode object and return:
try
:
output
=
unicode
(
utf8output
,
'utf8'
)
return
output
except
(
LookupError
,
UnicodeError
):
raise
HTMLParsingFailed
###!!! See here !!!###
# Parsers:
parser_native
=
NativeParser
()
# These can be reinstated some time down the line when command line
# parsers have worked out their charset support a little better
# (rather than the current 'if you get lynx with this patch available
# from some guys website, then recompile...'):
# It appears w3m requires patches to support utf8:
# parser_w3m = CLParser(["w3m -dump -cols ", ['cols'], " -T 'text/html' file://", ['filename']])
# It appear lynx doesn't support charsets:
# parser_lynx = CLParser(['lynx -dump -force-html -width=', ['cols'], ' file://', ['filename']])
# elinks works OK, except it appear not to support &#{unicoderef} tags, but these are rare(ish):
# Actually, trying
# parser_elinks = CLParser([ 'elinks -dump -dump-charset "utf-8" -force-html -dump-width ', ['cols'], ' file://', ['filename']])
# The version (2.1pre13) on my system of the other 'famous' command
# line browser name links doesn't seem to have a dump option!
available_parsers
=
[
# parser_w3m,
# parser_lynx,
# parser_elinks,
parser_native
]
# Key function:
def
html2txt
(
html
,
use_parsers
=
available_parsers
,
cols
=
72
):
# Try each parser in turn (given in the list use_parsers) to see
# if they work:
for
parser
in
use_parsers
:
try
:
text
=
parser
.
parse
(
html
,
cols
)
except
HTMLParsingFailed
:
continue
else
:
return
text
# None of the parsers worked.
raise
HTMLParsingFailed
# Errors:
class
HTMLParsingFailed
(
Exception
):
"""
Raised if HTML parsing fails for any reason.
"""
pass
class
UnicodeInputRequired
(
Exception
):
"""
Raised if attempt is made to parse anything other than unicode.
"""
Event Timeline
Log In to Comment