Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F69200194
htmlparser.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Jun 30, 17:13
Size
4 KB
Mime Type
text/x-python
Expires
Tue, Jul 2, 17:13 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
18682289
Attached To
R3600 invenio-infoscience
htmlparser.py
View Options
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""HTML parser for records."""
__revision__
=
"$Id$"
import
re
from
HTMLParser
import
HTMLParser
from
string
import
split
import
textwrap
import
htmlentitydefs
from
invenio.config
import
\
CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL
,
\
CFG_SITE_LANG
from
invenio.bibformat
import
format_record
from
invenio.bibindex_engine
import
re_html
from
invenio.messages
import
gettext_set_language
def wrap(text):
    """Re-flow text so that no line exceeds the configured alert e-mail width.

    The whole text is handed to textwrap.wrap() as a single paragraph, so
    the line breaks of the input are not kept.  Every line of the
    returned string, including the last one, is terminated by a newline.
    """
    wrapped = textwrap.wrap(
        text, CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL)
    return ''.join([piece + '\n' for piece in wrapped])
def wrap_records(text):
    """Limit the number of characters per line, wrapping each line on its own.

    The text is split on newline characters and every resulting line is
    wrapped separately to the configured alert e-mail width, so existing
    line breaks are kept as wrap boundaries.  Every emitted line is
    terminated by a newline.  Only the pieces returned by textwrap.wrap()
    are emitted, so an input line for which it returns nothing adds
    nothing to the result.
    """
    pieces = []
    # Use the str method rather than the deprecated string.split() function.
    for source_line in text.split('\n'):
        wrapped = textwrap.wrap(
            source_line, CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL)
        for piece in wrapped:
            pieces.append(piece + '\n')
    # Join once instead of growing the result string piece by piece.
    return ''.join(pieces)
class RecordHTMLParser(HTMLParser):
    """A parser for the HTML returned by invenio.search_engine.print_record.

    The parser provides methods to transform the HTML returned by
    invenio.search_engine.print_record into plain text, with some
    minor formatting.  The plain text is accumulated in the ``result``
    attribute while the document is fed to the parser.
    """

    # While True, text passed to handle_data() is discarded.
    silent = False

    def __init__(self):
        HTMLParser.__init__(self)
        self.result = ''
        # Per-link state.  It is initialised here so that a closing </a>
        # seen before any opening <a> does not raise AttributeError.
        self.printURL = 0
        self.unclosedBracket = 0

    def handle_starttag(self, tag, attrs):
        """Emit the textual counterpart of an opening tag.

        ``attrs`` is a sequence of (name, value) pairs.  For an <a> tag,
        any attribute whose value is 'note' opens a "Fulltext : <" label
        (closed by handle_endtag) and any attribute whose value is
        'moreinfo' emits a "Detailed record : " label followed by the
        link target.  <br> becomes a newline; <style> and <script>
        switch the parser to silent mode.
        """
        if tag == 'a':
            self.printURL = 0
            self.unclosedBracket = 0
            href = None
            for name, value in attrs:
                if value == 'note':
                    self.result += 'Fulltext : <'
                    self.unclosedBracket = 1
                if value == 'moreinfo':
                    self.result += 'Detailed record : '
                    self.printURL = 1
                if name == 'href' and value is not None:
                    href = value
            # The URL is written only after every attribute has been
            # seen, so the output does not depend on whether href comes
            # before or after the 'moreinfo' marker.
            if self.printURL == 1 and href is not None:
                self.result += '<' + href + '>'
        elif tag == 'br':
            self.result += '\n'
        elif tag in ('style', 'script'):
            self.silent = True

    def handle_endtag(self, tag):
        """Close a pending "Fulltext : <" bracket or leave silent mode."""
        if tag == 'a':
            if self.unclosedBracket == 1:
                self.result += '>'
                self.unclosedBracket = 0
        elif tag in ('style', 'script'):
            self.silent = False

    def handle_data(self, data):
        """Append text to the result unless it is a link label or silenced."""
        if data.lower() in ('detailed record', 'similar record', 'cited by'):
            return
        if not self.silent:
            self.result += data

    def handle_comment(self, data):
        """Toggle silent mode on START_NOT_FOR_TEXT / END_NOT_FOR_TEXT."""
        marker = data.upper().strip()
        if marker == 'START_NOT_FOR_TEXT':
            self.silent = True
        elif marker == 'END_NOT_FOR_TEXT':
            self.silent = False

    def handle_charref(self, name):
        """Process character references of the form "&#ref;".

        Transform to text whenever possible.  ``name`` is either a
        decimal number ('62') or a hexadecimal one prefixed with 'x' or
        'X' ('x3E').  References that cannot be converted are dropped.
        """
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
        except ValueError:
            return
        self._append_codepoint(code)

    def handle_entityref(self, name):
        """Process a general entity reference of the form "&name;".

        Transform to text whenever possible.  Unknown entity names are
        dropped.
        """
        code = htmlentitydefs.name2codepoint.get(name)
        if code is not None:
            self._append_codepoint(code)

    def _append_codepoint(self, code):
        """Append the UTF-8 encoding of the given code point to the result.

        Code points that cannot be represented or appended are skipped.
        """
        try:
            self.result += unichr(code).encode("utf-8")
        except (ValueError, OverflowError):
            # ValueError also covers the UnicodeError family.
            pass
def get_as_text(record_id, ln=CFG_SITE_LANG):
    """Return the record in a textual format.

    The record is formatted with format_record() in the "hb" output
    format, its newlines are replaced by spaces, and the resulting HTML
    is converted to plain text with RecordHTMLParser.  If the parser
    fails, the markup is instead blanked out with the re_html pattern.
    The translated "Detailed record", "Similar records" and "Cited by"
    labels, together with an adjacent '-' or ':' and surrounding
    whitespace, are then removed.

    @param record_id: identifier of the record, passed to format_record()
    @param ln: language used to translate the labels that are removed
    @return: the stripped plain-text rendering of the record
    """
    _ = gettext_set_language(ln)

    rec_in_hb = format_record(record_id, of="hb")
    rec_in_hb = rec_in_hb.replace('\n', ' ')

    htparser = RecordHTMLParser()
    try:
        htparser.feed(rec_in_hb)
        htparser.close()
        out = htparser.result
    except Exception:
        # Best effort: fall back to removing the markup with a regexp.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        out = re_html.sub(' ', rec_in_hb)

    # The _() calls keep literal arguments so that message extraction
    # tools still find these strings.
    labels = (_("Detailed record"), _("Similar records"), _("Cited by"))
    for label in labels:
        # A translation may contain regexp metacharacters; escape it so
        # that it is always matched literally.
        out = re.sub(r"[\-:]?\s*%s\s*[\-:]?" % re.escape(label), "", out)
    return out.strip()
Event Timeline
Log In to Comment