Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F90386711
engine_washer.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Nov 1, 04:49
Size
7 KB
Mime Type
text/x-python
Expires
Sun, Nov 3, 04:49 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
22066098
Attached To
R3600 invenio-infoscience
engine_washer.py
View Options
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2009, 2010, 2011, 2012, 2013, 2015 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
import
re
from
invenio.legacy.bibindex.engine_stemmer
import
stem
from
invenio.legacy.bibindex.engine_stopwords
import
is_stopword
from
invenio.config
import
CFG_BIBINDEX_MIN_WORD_LENGTH
,
\
CFG_ETCDIR
re_pattern_fuzzy_author_dots
=
re
.
compile
(
r'[\.\-]+'
)
re_pattern_fuzzy_author_spaces
=
re
.
compile
(
r'\s+'
)
re_pattern_author_canonical_id
=
re
.
compile
(
r'\.[0-9]+$'
)
re_unicode_lowercase_a
=
re
.
compile
(
unicode
(
r"(?u)[áàäâãå]"
,
"utf-8"
))
re_unicode_lowercase_ae
=
re
.
compile
(
unicode
(
r"(?u)[æ]"
,
"utf-8"
))
re_unicode_lowercase_e
=
re
.
compile
(
unicode
(
r"(?u)[éèëê]"
,
"utf-8"
))
re_unicode_lowercase_i
=
re
.
compile
(
unicode
(
r"(?u)[íìïî]"
,
"utf-8"
))
re_unicode_lowercase_o
=
re
.
compile
(
unicode
(
r"(?u)[óòöôõø]"
,
"utf-8"
))
re_unicode_lowercase_u
=
re
.
compile
(
unicode
(
r"(?u)[úùüû]"
,
"utf-8"
))
re_unicode_lowercase_y
=
re
.
compile
(
unicode
(
r"(?u)[ýÿ]"
,
"utf-8"
))
re_unicode_lowercase_c
=
re
.
compile
(
unicode
(
r"(?u)[çć]"
,
"utf-8"
))
re_unicode_lowercase_n
=
re
.
compile
(
unicode
(
r"(?u)[ñ]"
,
"utf-8"
))
re_unicode_uppercase_a
=
re
.
compile
(
unicode
(
r"(?u)[ÁÀÄÂÃÅ]"
,
"utf-8"
))
re_unicode_uppercase_ae
=
re
.
compile
(
unicode
(
r"(?u)[Æ]"
,
"utf-8"
))
re_unicode_uppercase_e
=
re
.
compile
(
unicode
(
r"(?u)[ÉÈËÊ]"
,
"utf-8"
))
re_unicode_uppercase_i
=
re
.
compile
(
unicode
(
r"(?u)[ÍÌÏÎ]"
,
"utf-8"
))
re_unicode_uppercase_o
=
re
.
compile
(
unicode
(
r"(?u)[ÓÒÖÔÕØ]"
,
"utf-8"
))
re_unicode_uppercase_u
=
re
.
compile
(
unicode
(
r"(?u)[ÚÙÜÛ]"
,
"utf-8"
))
re_unicode_uppercase_y
=
re
.
compile
(
unicode
(
r"(?u)[Ý]"
,
"utf-8"
))
re_unicode_uppercase_c
=
re
.
compile
(
unicode
(
r"(?u)[ÇĆ]"
,
"utf-8"
))
re_unicode_uppercase_n
=
re
.
compile
(
unicode
(
r"(?u)[Ñ]"
,
"utf-8"
))
re_latex_lowercase_a
=
re
.
compile
(
"
\\\\
[
\"
H'`~^vu=k]\{?a\}?"
)
re_latex_lowercase_ae
=
re
.
compile
(
"
\\\\
ae
\\
{
\\
}?"
)
re_latex_lowercase_e
=
re
.
compile
(
"
\\\\
[
\"
H'`~^vu=k]
\\
{?e
\\
}?"
)
re_latex_lowercase_i
=
re
.
compile
(
"
\\\\
[
\"
H'`~^vu=k]
\\
{?i
\\
}?"
)
re_latex_lowercase_o
=
re
.
compile
(
"
\\\\
[
\"
H'`~^vu=k]
\\
{?o
\\
}?"
)
re_latex_lowercase_u
=
re
.
compile
(
"
\\\\
[
\"
H'`~^vu=k]
\\
{?u
\\
}?"
)
re_latex_lowercase_y
=
re
.
compile
(
"
\\\\
[
\"
']
\\
{?y
\\
}?"
)
re_latex_lowercase_c
=
re
.
compile
(
"
\\\\
['uc]
\\
{?c
\\
}?"
)
re_latex_lowercase_n
=
re
.
compile
(
"
\\\\
[c'~^vu]
\\
{?n
\\
}?"
)
re_latex_uppercase_a
=
re
.
compile
(
"
\\\\
[
\"
H'`~^vu=k]
\\
{?A
\\
}?"
)
re_latex_uppercase_ae
=
re
.
compile
(
"
\\\\
AE
\\
{?
\\
}?"
)
re_latex_uppercase_e
=
re
.
compile
(
"
\\\\
[
\"
H'`~^vu=k]
\\
{?E
\\
}?"
)
re_latex_uppercase_i
=
re
.
compile
(
"
\\\\
[
\"
H'`~^vu=k]
\\
{?I
\\
}?"
)
re_latex_uppercase_o
=
re
.
compile
(
"
\\\\
[
\"
H'`~^vu=k]
\\
{?O
\\
}?"
)
re_latex_uppercase_u
=
re
.
compile
(
"
\\\\
[
\"
H'`~^vu=k]
\\
{?U
\\
}?"
)
re_latex_uppercase_y
=
re
.
compile
(
"
\\\\
[
\"
']
\\
{?Y
\\
}?"
)
re_latex_uppercase_c
=
re
.
compile
(
"
\\\\
['uc]
\\
{?C
\\
}?"
)
re_latex_uppercase_n
=
re
.
compile
(
"
\\\\
[c'~^vu]
\\
{?N
\\
}?"
)
def
lower_index_term
(
term
):
"""
Return safely lowered index term TERM. This is done by converting
to UTF-8 first, because standard Python lower() function is not
UTF-8 safe. To be called by both the search engine and the
indexer when appropriate (e.g. before stemming).
In case of problems with UTF-8 compliance, this function raises
UnicodeDecodeError, so the client code may want to catch it.
"""
return
unicode
(
term
,
'utf-8'
)
.
lower
()
.
encode
(
'utf-8'
)
latex_markup_re
=
re
.
compile
(
r"\\begin(\[.+?\])?\{.+?\}|\\end\{.+?}|\\\w+(\[.+?\])?\{(?P<inside1>.*?)\}|\{\\\w+ (?P<inside2>.*?)\}"
)
def
remove_latex_markup
(
phrase
):
ret_phrase
=
''
index
=
0
for
match
in
latex_markup_re
.
finditer
(
phrase
):
ret_phrase
+=
phrase
[
index
:
match
.
start
()]
ret_phrase
+=
match
.
group
(
'inside1'
)
or
match
.
group
(
'inside2'
)
or
''
index
=
match
.
end
()
ret_phrase
+=
phrase
[
index
:]
return
ret_phrase
def
apply_stemming
(
word
,
stemming_language
):
"""Returns word after applying stemming (if stemming language is set).
You can change your stemming language in database.
@param word: word to be checked
@type word: str
@param stemming_language: abbreviation of language or None
@type stemming_language: str
"""
if
stemming_language
:
word
=
stem
(
word
,
stemming_language
)
return
word
def
remove_stopwords
(
word
,
stopwords_kb
=
None
):
"""Returns word after stopword check.
One must specify the name of the knowledge base.
@param word: word to be checked
@type word: str
@param stopwords_kb: name of the stopwords knowledge base
@type word: str
"""
if
stopwords_kb
is
not
None
:
if
is_stopword
(
word
,
stopwords_kb
):
return
""
return
word
def
length_check
(
word
):
"""Returns word after length check.
@param word: word to be checked
@type word: str
"""
if
len
(
word
)
<
CFG_BIBINDEX_MIN_WORD_LENGTH
:
return
""
return
word
def
wash_index_term
(
term
,
max_char_length
=
50
,
lower_term
=
True
):
"""
Return washed form of the index term TERM that would be suitable
for storing into idxWORD* tables. I.e., lower the TERM if
LOWER_TERM is True, and truncate it safely to MAX_CHAR_LENGTH
UTF-8 characters (meaning, in principle, 4*MAX_CHAR_LENGTH bytes).
The function works by an internal conversion of TERM, when needed,
from its input Python UTF-8 binary string format into Python
Unicode format, and then truncating it safely to the given number
of UTF-8 characters, without possible mis-truncation in the middle
of a multi-byte UTF-8 character that could otherwise happen if we
would have been working with UTF-8 binary representation directly.
Note that MAX_CHAR_LENGTH corresponds to the length of the term
column in idxINDEX* tables.
"""
if
lower_term
:
washed_term
=
unicode
(
term
,
'utf-8'
)
.
lower
()
else
:
washed_term
=
unicode
(
term
,
'utf-8'
)
if
len
(
washed_term
)
<=
max_char_length
:
# no need to truncate the term, because it will fit
# nicely even if it uses four-byte UTF-8 characters
return
washed_term
.
encode
(
'utf-8'
)
else
:
# truncate the term in a safe position:
return
washed_term
[:
max_char_length
]
.
encode
(
'utf-8'
)
def
wash_author_name
(
p
):
"""
Wash author name suitable for author searching. Notably, replace
dots and hyphens with spaces, and collapse spaces.
"""
if
re_pattern_author_canonical_id
.
search
(
p
):
# we have canonical author ID form, so ignore all washing
return
p
out
=
re_pattern_fuzzy_author_dots
.
sub
(
" "
,
p
)
out
=
re_pattern_fuzzy_author_spaces
.
sub
(
" "
,
out
)
return
out
.
strip
()
Event Timeline
Log In to Comment