Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F88641690
htmlparser.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Oct 19, 22:21
Size
3 KB
Mime Type
text/x-c
Expires
Mon, Oct 21, 22:21 (2 d)
Engine
blob
Format
Raw Data
Handle
21802444
Attached To
R3600 invenio-infoscience
htmlparser.py
View Options
## $Id$
## HTML parser for records.
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002, 2003, 2004, 2005 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDSware; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
##read config variables
#include "config.wml"
#include "configbis.wml"
<protect>## $Id$ </protect>
<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
"""HTML parser for records."""
## rest of the Python code goes below
# Module version string. The literal is the "$Id$" keyword placeholder
# (presumably expanded by the revision control system -- TODO confirm).
__version__ = "$Id$"
from
config
import
*
from
search_engine
import
print_record
from
HTMLParser
import
HTMLParser
import
textwrap
from
string
import
split
# Maximum line width, in characters, handed to textwrap.wrap() by the
# wrap() and wrap_records() helpers below.
WRAPWIDTH = 72
def wrap(text, width=None):
    """Wrap `text` as one paragraph and return it as a single string.

    The text is passed to textwrap.wrap(); every resulting line is
    terminated with a newline character, including the last one.

    Parameters:
        text  -- the string to wrap.
        width -- maximum line width; when omitted (None) the module-level
                 WRAPWIDTH is used, which is the historical behaviour.

    Returns the wrapped text, or the empty string when textwrap.wrap()
    produces no lines (e.g. for empty input).
    """
    if width is None:
        width = WRAPWIDTH
    # Build the result in one pass instead of repeated string '+='.
    return ''.join([line + '\n' for line in textwrap.wrap(text, width)])
def wrap_records(text, width=None):
    """Wrap each newline-separated line of `text` independently.

    Unlike wrap(), existing line breaks are kept as boundaries: the input
    is split on '\\n' and each piece is wrapped on its own with
    textwrap.wrap().  Every output line is terminated with a newline.

    Parameters:
        text  -- the string to wrap; may contain several lines.
        width -- maximum line width; when omitted (None) the module-level
                 WRAPWIDTH is used, which is the historical behaviour.

    Returns the wrapped text as one string.  Input lines for which
    textwrap.wrap() yields nothing (such as empty lines) contribute no
    output, so blank lines in the input do not appear in the result.
    """
    if width is None:
        width = WRAPWIDTH
    wrapped = []
    # str.split replaces the deprecated string.split() function; the
    # semantics for an explicit separator are the same.
    for line in text.split('\n'):
        wrapped.extend(textwrap.wrap(line, width))
    return ''.join([line + '\n' for line in wrapped])
class RecordHTMLParser(HTMLParser):
    """A parser for the HTML returned by cdsware.search_engine.print_record.

    The parser provides methods to transform the HTML returned by
    cdsware.search_engine.print_record into plain text, with some
    minor formatting.

    The accumulated plain text is available in the `result` attribute
    after feed() has been called.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Plain-text output accumulated while parsing.
        self.result = ''
        # State of the anchor currently being processed.  Initialised here
        # so that an </a> seen before any <a> does not raise
        # AttributeError in handle_endtag().
        self.printURL = 0          # 1 once a 'moreinfo' attribute value was seen
        self.unclosedBracket = 0   # 1 while a '<' opened for a 'note' link is pending

    def handle_starttag(self, tag, attrs):
        """Emit text for an opening tag.

        <strong> is ignored, <br> becomes a newline, and <a> is
        inspected attribute by attribute:

        * an attribute whose value is 'note' starts 'Fulltext : <' and
          leaves the bracket open for handle_endtag() to close;
        * an attribute whose value is 'moreinfo' emits
          'Detailed record : ' and enables URL printing;
        * once URL printing is enabled, an 'href' attribute is emitted
          as '<url>'.

        NOTE(review): attributes are processed in document order, so the
        href is only printed when it follows the 'moreinfo' attribute.
        """
        if tag == 'strong':
            # self.result += '*'
            pass
        elif tag == 'a':
            # Each anchor starts with a clean state.
            self.printURL = 0
            self.unclosedBracket = 0
            for name, value in attrs:
                if value == 'note':
                    self.result += 'Fulltext : <'
                    self.unclosedBracket = 1
                if value == 'moreinfo':
                    self.result += 'Detailed record : '
                    self.printURL = 1
                if (self.printURL == 1) and (name == 'href'):
                    self.result += '<' + value + '>'
        elif tag == 'br':
            self.result += '\n'

    def handle_endtag(self, tag):
        """Close the bracket opened for a 'note' anchor, if any."""
        if tag == 'strong':
            # self.result += '\n'
            pass
        elif tag == 'a':
            if self.unclosedBracket == 1:
                self.result += '>'
                self.unclosedBracket = 0

    def handle_data(self, data):
        """Append text content, skipping the literal 'Detailed record'.

        That label is dropped because handle_starttag() already emits
        'Detailed record : ' for 'moreinfo' anchors.
        """
        if data == 'Detailed record':
            pass
        else:
            self.result += data

    def handle_comment(self, data):
        """Ignore HTML comments."""
        pass
def get_as_text(record_id):
    """Return the plain text from RecordHTMLParser of the record.

    The record is rendered with print_record() and the resulting HTML is
    fed to a RecordHTMLParser.

    Parameters:
        record_id -- identifier of the record, passed to print_record().

    Returns the parser's accumulated text when parsing succeeds.  If
    feeding the parser raises, the text gathered so far is returned
    instead, followed by a 'Detailed record' link for the record and
    passed through wrap().
    """
    rec = print_record(record_id)
    htparser = RecordHTMLParser()
    try:
        htparser.feed(rec)
    except Exception:
        # Best effort: keep whatever was parsed and point the reader at the
        # full record.  'except Exception' (rather than a bare 'except')
        # lets KeyboardInterrupt and SystemExit propagate.
        return wrap(htparser.result + 'Detailed record: <http://cdsweb.cern.ch/search.py?recid=%s>.' % record_id)
    return htparser.result
if __name__ == "__main__":
    # Manual check: show the raw print_record() output for one record,
    # a separator, then the plain-text rendering of the same record.
    sample_recid = 619028
    raw_html = print_record(sample_recid)
    print(raw_html)
    print("***")
    print(get_as_text(sample_recid))
Event Timeline
Log In to Comment