Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F102620713
BibIndexFulltextTokenizer.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Feb 22, 14:51
Size
9 KB
Mime Type
text/x-python
Expires
Mon, Feb 24, 14:51 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
24375045
Attached To
R3600 invenio-infoscience
BibIndexFulltextTokenizer.py
View Options
# -*- coding:utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2010, 2011, 2012 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""BibIndexFulltextTokenizer: extracts words form a given document.
Document is given by its URL.
"""
import
os
import
sys
import
logging
import
urllib2
import
re
from
invenio.config
import
\
CFG_SOLR_URL
,
\
CFG_XAPIAN_ENABLED
,
\
CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY
,
\
CFG_BIBINDEX_SPLASH_PAGES
from
invenio.htmlutils
import
get_links_in_html_page
from
invenio.websubmit_file_converter
import
convert_file
,
get_file_converter_logger
from
invenio.solrutils_bibindex_indexer
import
solr_add_fulltext
from
invenio.xapianutils_bibindex_indexer
import
xapian_add
from
invenio.bibdocfile
import
bibdocfile_url_p
,
\
bibdocfile_url_to_bibdoc
,
download_url
,
\
BibRecDocs
from
invenio.bibindexadminlib
import
get_idx_indexer
from
invenio.bibtask
import
write_message
from
invenio.errorlib
import
register_exception
from
invenio.intbitset
import
intbitset
from
invenio.bibindex_tokenizers.BibIndexDefaultTokenizer
import
BibIndexDefaultTokenizer
fulltext_added
=
intbitset
()
# stores ids of records whose fulltexts have been added
class
BibIndexFulltextTokenizer
(
BibIndexDefaultTokenizer
):
"""
Exctracts all the words contained in document specified by url.
"""
def
__init__
(
self
,
stemming_language
=
None
,
remove_stopwords
=
False
,
remove_html_markup
=
False
,
remove_latex_markup
=
False
):
self
.
verbose
=
3
BibIndexDefaultTokenizer
.
__init__
(
self
,
stemming_language
,
remove_stopwords
,
remove_html_markup
,
remove_latex_markup
)
def
set_verbose
(
self
,
verbose
):
"""Allows to change verbosity level during indexing"""
self
.
verbose
=
verbose
def
tokenize_for_words_default
(
self
,
phrase
):
"""Default tokenize_for_words inherited from default tokenizer"""
return
super
(
BibIndexFulltextTokenizer
,
self
)
.
tokenize_for_words
(
phrase
)
def
get_words_from_fulltext
(
self
,
url_direct_or_indirect
):
"""Returns all the words contained in the document specified by
URL_DIRECT_OR_INDIRECT with the words being split by various
SRE_SEPARATORS regexp set earlier. If FORCE_FILE_EXTENSION is
set (e.g. to "pdf", then treat URL_DIRECT_OR_INDIRECT as a PDF
file. (This is interesting to index Indico for example.) Note
also that URL_DIRECT_OR_INDIRECT may be either a direct URL to
the fulltext file or an URL to a setlink-like page body that
presents the links to be indexed. In the latter case the
URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs
to fulltext documents, for all knows file extensions as
specified by global CONV_PROGRAMS config variable.
"""
write_message
(
"... reading fulltext files from
%s
started"
%
url_direct_or_indirect
,
verbose
=
2
)
try
:
if
bibdocfile_url_p
(
url_direct_or_indirect
):
write_message
(
"...
%s
is an internal document"
%
url_direct_or_indirect
,
verbose
=
2
)
bibdoc
=
bibdocfile_url_to_bibdoc
(
url_direct_or_indirect
)
indexer
=
get_idx_indexer
(
'fulltext'
)
if
indexer
!=
'native'
:
# A document might belong to multiple records
for
rec_link
in
bibdoc
.
bibrec_links
:
recid
=
rec_link
[
"recid"
]
# Adds fulltexts of all files once per records
if
not
recid
in
fulltext_added
:
bibrecdocs
=
BibRecDocs
(
recid
)
text
=
bibrecdocs
.
get_text
()
if
indexer
==
'SOLR'
and
CFG_SOLR_URL
:
solr_add_fulltext
(
recid
,
text
)
elif
indexer
==
'XAPIAN'
and
CFG_XAPIAN_ENABLED
:
xapian_add
(
recid
,
'fulltext'
,
text
)
fulltext_added
.
add
(
recid
)
# we are relying on an external information retrieval system
# to provide full-text indexing, so dispatch text to it and
# return nothing here:
return
[]
else
:
text
=
""
if
hasattr
(
bibdoc
,
"get_text"
):
text
=
bibdoc
.
get_text
()
return
self
.
tokenize_for_words_default
(
text
)
else
:
if
CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY
:
write_message
(
"...
%s
is external URL but indexing only local files"
%
url_direct_or_indirect
,
verbose
=
2
)
return
[]
write_message
(
"...
%s
is an external URL"
%
url_direct_or_indirect
,
verbose
=
2
)
urls_to_index
=
set
()
for
splash_re
,
url_re
in
CFG_BIBINDEX_SPLASH_PAGES
.
iteritems
():
if
re
.
match
(
splash_re
,
url_direct_or_indirect
):
write_message
(
"...
%s
is a splash page (
%s
)"
%
(
url_direct_or_indirect
,
splash_re
),
verbose
=
2
)
html
=
urllib2
.
urlopen
(
url_direct_or_indirect
)
.
read
()
urls
=
get_links_in_html_page
(
html
)
write_message
(
"... found these URLs in
%s
splash page:
%s
"
%
(
url_direct_or_indirect
,
", "
.
join
(
urls
)),
verbose
=
3
)
for
url
in
urls
:
if
re
.
match
(
url_re
,
url
):
write_message
(
"... will index
%s
(matched by
%s
)"
%
(
url
,
url_re
),
verbose
=
2
)
urls_to_index
.
add
(
url
)
if
not
urls_to_index
:
urls_to_index
.
add
(
url_direct_or_indirect
)
write_message
(
"... will extract words from
%s
"
%
', '
.
join
(
urls_to_index
),
verbose
=
2
)
words
=
{}
for
url
in
urls_to_index
:
tmpdoc
=
download_url
(
url
)
file_converter_logger
=
get_file_converter_logger
()
old_logging_level
=
file_converter_logger
.
getEffectiveLevel
()
if
self
.
verbose
>
3
:
file_converter_logger
.
setLevel
(
logging
.
DEBUG
)
try
:
try
:
tmptext
=
convert_file
(
tmpdoc
,
output_format
=
'.txt'
)
text
=
open
(
tmptext
)
.
read
()
os
.
remove
(
tmptext
)
indexer
=
get_idx_indexer
(
'fulltext'
)
if
indexer
!=
'native'
:
if
indexer
==
'SOLR'
and
CFG_SOLR_URL
:
solr_add_fulltext
(
None
,
text
)
# FIXME: use real record ID
if
indexer
==
'XAPIAN'
and
CFG_XAPIAN_ENABLED
:
#xapian_add(None, 'fulltext', text) # FIXME: use real record ID
pass
# we are relying on an external information retrieval system
# to provide full-text indexing, so dispatch text to it and
# return nothing here:
tmpwords
=
[]
else
:
tmpwords
=
self
.
tokenize_for_words_default
(
text
)
words
.
update
(
dict
(
map
(
lambda
x
:
(
x
,
1
),
tmpwords
)))
except
Exception
,
e
:
message
=
'ERROR: it
\'
s impossible to correctly extract words from
%s
referenced by
%s
:
%s
'
%
(
url
,
url_direct_or_indirect
,
e
)
register_exception
(
prefix
=
message
,
alert_admin
=
True
)
write_message
(
message
,
stream
=
sys
.
stderr
)
finally
:
os
.
remove
(
tmpdoc
)
if
self
.
verbose
>
3
:
file_converter_logger
.
setLevel
(
old_logging_level
)
return
words
.
keys
()
except
Exception
,
e
:
message
=
'ERROR: it
\'
s impossible to correctly extract words from
%s
:
%s
'
%
(
url_direct_or_indirect
,
e
)
register_exception
(
prefix
=
message
,
alert_admin
=
True
)
write_message
(
message
,
stream
=
sys
.
stderr
)
return
[]
def
tokenize_for_words
(
self
,
phrase
):
return
self
.
get_words_from_fulltext
(
phrase
)
Event Timeline
Log In to Comment