Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F90679146
htmlparser.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Nov 3, 19:59
Size
6 KB
Mime Type
text/x-python
Expires
Tue, Nov 5, 19:59 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
22118576
Attached To
R3600 invenio-infoscience
htmlparser.py
View Options
## This file is part of Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""HTML parser for records."""
# Revision identifier of this module. "$Id$" looks like an unexpanded
# version-control keyword -- NOTE(review): confirm it is still substituted.
__revision__ = "$Id$"
import
re
from
HTMLParser
import
HTMLParser
import
textwrap
import
htmlentitydefs
from
invenio.config
import
\
CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL
,
\
CFG_SITE_LANG
from
invenio.bibformat
import
format_record
from
invenio.htmlutils
import
remove_html_markup
from
invenio.messages
import
gettext_set_language
# Matches a run of one or more spaces/tabs (newlines are deliberately not
# included). Used to collapse consecutive whitespace into a single space
# once the HTML markup has been removed.
whitespaces_pattern = re.compile(r'[ \t]+')
def wrap(text, width=None):
    """Limit the number of characters per line in the given text.

    Existing line breaks are NOT preserved: the whole text is re-flowed
    as one paragraph by textwrap.wrap().

    text  -- the text to re-flow.
    width -- maximum number of characters per line. When None (the
             default), CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL
             is used.

    Returns the wrapped text in which every line, including the last
    one, is terminated by a newline. Returns the empty string when
    there is nothing to wrap.
    """
    if width is None:
        width = CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL
    # Join once instead of growing the string with repeated "+=".
    return ''.join([line + '\n' for line in textwrap.wrap(text, width)])
def wrap_records(text, width=None):
    """Limit the number of characters per line in the given text.

    Existing line breaks ARE preserved: every input line is wrapped on
    its own, and blank lines are kept as blank lines.

    text  -- the text to wrap; lines are separated by a newline character.
    width -- maximum number of characters per line. When None (the
             default), CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL
             is used.

    Returns the wrapped lines joined by a newline (no trailing newline
    is added).
    """
    if width is None:
        width = CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL
    result_lines = []
    for line in text.split('\n'):
        # textwrap.wrap() returns an empty list for an empty or
        # whitespace-only line. Keep such a line as an empty one,
        # otherwise the blank lines of the input would silently vanish.
        result_lines.extend(textwrap.wrap(line, width) or [''])
    return '\n'.join(result_lines)
class RecordHTMLParser(HTMLParser):
    """A parser for the HTML returned by invenio.search_engine.print_record.

    The parser provides methods to transform the HTML returned by
    invenio.search_engine.print_record into plain text, with some
    minor formatting. Feed the HTML with feed(), call close(), then
    read the accumulated text from the 'result' attribute.
    """

    # True while the text being parsed must not reach the output: inside
    # <style>/<script>, or between the START_NOT_FOR_TEXT and
    # END_NOT_FOR_TEXT comments.
    silent = False
    # Are we at the beginning of a new line? (after <br>, <br/>, <p>,
    # </p> or at the beginning of the text)
    new_line = True

    def __init__(self):
        HTMLParser.__init__(self)
        # Plain text accumulated so far.
        self.result = ''
        # Set here (and not only when an <a> is opened) so that a stray
        # </a> without a matching <a> cannot raise AttributeError.
        self.printURL = 0
        self.unclosedBracket = 0

    def handle_starttag(self, tag, attrs):
        """Handle an opening tag; 'attrs' is a list of (name, value) pairs.

        Tags other than <a>, <br>, <style>, <script> and <p> produce
        no output.
        """
        if tag == 'a':
            self.printURL = 0
            self.unclosedBracket = 0
            # A link is the "detailed record" link when any of its
            # attribute values is 'moreinfo'. Look at all the attributes
            # first so that the URL is printed whatever the relative
            # order of 'href' and the 'moreinfo' attribute.
            is_moreinfo = False
            for dummy_name, value in attrs:
                if value == 'moreinfo':
                    is_moreinfo = True
                    break
            if is_moreinfo:
                self.result += 'Detailed record : '
                self.printURL = 1
                for name, value in attrs:
                    # 'value' is None for an attribute without a value.
                    if name == 'href' and value is not None:
                        self.result += '<' + value + '>'
        elif tag == 'br':
            self.result += '\n'
            # Same as for <br/>: the text that follows starts a line.
            self.new_line = True
        elif tag in ('style', 'script'):
            self.silent = True
        elif tag == 'p':
            # Only break the line if we are not already at the start
            # of one, to avoid emitting empty lines.
            if not self.new_line:
                self.result += '\n'
                self.new_line = True

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag; only <br/> produces output."""
        if tag == 'br':
            self.result += '\n'
            self.new_line = True

    def handle_endtag(self, tag):
        """Handle a closing tag."""
        if tag == 'a':
            if self.unclosedBracket == 1:
                self.result += '>'
                self.unclosedBracket = 0
        elif tag in ('style', 'script'):
            self.silent = False
        elif tag == 'p':
            self.result += '\n'
            self.new_line = True

    def handle_data(self, data):
        """Append text to the result unless it has to be discarded.

        Text is discarded when it is exactly one of the link labels
        listed below (case-insensitive) or when the parser is silent.
        """
        if data.lower() in ('detailed record', 'similar record', 'cited by'):
            return
        if self.silent:
            return
        if self.new_line:
            # Remove unnecessary leading whitespace at the beginning
            # of a line
            data = data.lstrip()
        # Consecutive whitespaces are not merged here; 'result' is
        # appended to as is.
        self.result += data
        self.new_line = False

    def handle_comment(self, data):
        """Switch silent mode on/off on the *_NOT_FOR_TEXT comments."""
        marker = data.upper().strip()
        if marker == 'START_NOT_FOR_TEXT':
            self.silent = True
        elif marker == 'END_NOT_FOR_TEXT':
            self.silent = False

    def handle_charref(self, name):
        """Process character references of the form "&#ref;".

        Both decimal ("&#65;") and hexadecimal ("&#x41;") references
        are transformed to UTF-8 encoded text. References that cannot
        be converted are dropped.
        """
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
            self.result += unichr(code).encode("utf-8")
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference: ignore it.
            return

    def handle_entityref(self, name):
        """Process a general entity reference of the form "&name;".

        Known entities are transformed to UTF-8 encoded text; unknown
        ones are dropped.
        """
        if name == 'nbsp':
            # Keep them for the moment, as a literal '&nbsp;'
            # placeholder. It is needed to consider them separately,
            # as consecutive whitespaces are expected to be merged into
            # a single one after parsing, which should not happen to
            # non-breakable spaces. The placeholder is presumably
            # turned into a space by whoever post-processes 'result'.
            # Note that because of this trick, a literal "&nbsp;" text
            # in the input cannot be told apart from the entity.
            self.result += '&nbsp;'
            return

        char_code = htmlentitydefs.name2codepoint.get(name)
        if char_code is not None:
            try:
                self.result += unichr(char_code).encode("utf-8")
            except (ValueError, OverflowError):
                return
def get_as_text(record_id=0, xml_record=None, ln=CFG_SITE_LANG):
    """Return the record in a textual format.

    The record is formatted with format_record() in the "hb" output
    format, then the resulting HTML is converted to plain text.

    record_id  -- ID of the record to format; takes precedence over
                  'xml_record' when different from 0.
    xml_record -- XML of the record, used when 'record_id' is 0.
    ln         -- language used to translate the link labels that are
                  stripped from the text.

    Returns the plain text, or the empty string when neither
    'record_id' nor 'xml_record' is given.
    """
    _ = gettext_set_language(ln)
    if record_id != 0:
        rec_in_hb = format_record(record_id, of="hb")
    elif xml_record:
        rec_in_hb = format_record(0, of="hb", xml_record=xml_record)
    else:
        # Nothing to format: previously this fell through and failed
        # on the unbound 'rec_in_hb'.
        return ""

    # Line breaks of the HTML source are not meaningful; the ones of
    # the text come from <br> and <p>.
    rec_in_hb = rec_in_hb.replace('\n', ' ')
    htparser = RecordHTMLParser()
    try:
        htparser.feed(rec_in_hb)
        htparser.close()
        out = htparser.result
    except Exception:
        # Best effort: if the HTML cannot be parsed, fall back to a
        # plain removal of the markup.
        out = remove_html_markup(rec_in_hb)

    # Remove leading/trailing whitespace and linefeeds
    out = out.strip()
    # Merge consecutive whitespaces. Must be done here, once all HTML
    # tags have been removed
    out = whitespaces_pattern.sub(' ', out)
    # Now consider non-breakable spaces, kept until now as '&nbsp;'
    # so that they are not merged with their neighbours
    out = out.replace('&nbsp;', ' ')
    # Drop the labels of the links, together with the optional '-' or
    # ':' separator around them. The translated label is escaped so
    # that it is always matched literally.
    for label in (_("Detailed record"), _("Similar records"), _("Cited by")):
        out = re.sub(r"[\-:]?\s*%s\s*[\-:]?" % re.escape(label), "", out)
    return out.strip()
Event Timeline
Log In to Comment