Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F87729283
BibIndexYearTokenizer.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Oct 14, 14:16
Size
2 KB
Mime Type
text/x-python
Expires
Wed, Oct 16, 14:16 (2 d)
Engine
blob
Format
Raw Data
Handle
21644896
Attached To
R3600 invenio-infoscience
BibIndexYearTokenizer.py
View Options
# -*- coding:utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2010, 2011, 2012 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""BibIndexYearTokenizer: useful for year index. Extracts words (year) from date tags.
"""
from
invenio.config
import
\
CFG_INSPIRE_SITE
from
invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer
import
BibIndexDefaultTokenizer
class BibIndexYearTokenizer(BibIndexDefaultTokenizer):
    """
    Tokenizer intended for year indexes.

    When CFG_INSPIRE_SITE is truthy, phrases are broken into date words
    and their dash-separated prefixes; otherwise tokenization is
    delegated unchanged to BibIndexDefaultTokenizer.
    """

    def __init__(self, stemming_language=None, remove_stopwords=False,
                 remove_html_markup=False, remove_latex_markup=False):
        """
        Forward all four options, positionally and unmodified, to
        BibIndexDefaultTokenizer.__init__.
        """
        BibIndexDefaultTokenizer.__init__(
            self,
            stemming_language,
            remove_stopwords,
            remove_html_markup,
            remove_latex_markup,
        )

    def get_words_from_date_tag(self, datestring):
        """
        Return the index terms for a date-like string.

        The input is split on whitespace. Every resulting word is kept
        as a term, immediately followed by each of its proper prefixes
        taken at '-' boundaries, shortest first. For example
        "2010-03-15" gives ["2010-03-15", "2010", "2010-03"]; the month
        or the day never shows up as a standalone term.
        """
        terms = []
        # The value may hold several whitespace-separated dates.
        for word in datestring.split():
            terms.append(word)
            pieces = word.split('-')
            # Proper prefixes only: the complete word was added above.
            terms.extend(
                "-".join(pieces[:count]) for count in range(1, len(pieces))
            )
        return terms

    def tokenize_for_words_default(self, phrase):
        """Run the tokenize_for_words implementation of the parent class."""
        parent = super(BibIndexYearTokenizer, self)
        return parent.tokenize_for_words(phrase)

    def tokenize_for_words(self, phrase):
        """
        Tokenize phrase for the year index.

        When CFG_INSPIRE_SITE is falsy the parent's default word
        tokenization is used; otherwise the words are extracted with
        get_words_from_date_tag.
        """
        if not CFG_INSPIRE_SITE:
            return self.tokenize_for_words_default(phrase)
        return self.get_words_from_date_tag(phrase)
Event Timeline
Log In to Comment