search_engine_query_parser.py

# -*- coding: utf-8 -*-
## This file is part of Invenio.
## Copyright (C) 2008, 2010, 2011, 2012 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
# pylint: disable=C0301
"""Invenio Search Engine query parsers."""
import re
import string
from datetime import datetime
try:
import dateutil
if not hasattr(dateutil, '__version__') or dateutil.__version__ != '2.0':
from dateutil import parser as du_parser
from dateutil.relativedelta import relativedelta as du_delta
GOT_DATEUTIL = True
else:
from warnings import warn
warn("Not using dateutil module because the version %s is not compatible with Python-2.x" % dateutil.__version__)
GOT_DATEUTIL = False
except ImportError:
# Ok, no date parsing is possible, but continue anyway,
# since this package is only recommended, not mandatory.
GOT_DATEUTIL = False
from invenio.bibindex_engine_tokenizer import BibIndexFuzzyNameTokenizer as FNT
from invenio.logicutils import to_cnf
from invenio.config import CFG_WEBSEARCH_SPIRES_SYNTAX
NameScanner = FNT()
class InvenioWebSearchMismatchedParensError(Exception):
"""Exception for parse errors caused by mismatched parentheses."""
def __init__(self, message):
"""Initialization."""
self.message = message
def __str__(self):
"""String representation."""
return repr(self.message)
class SearchQueryParenthesisedParser(object):
"""Search query parser that handles arbitrarily-nested parentheses
Parameters:
* substitution_dict: a dictionary mapping strings to other strings. By
default, maps 'and', 'or' and 'not' to '+', '|', and '-'. Dictionary
values will be treated as valid operators for output.
A note (valkyrie 25.03.2011):
Based on looking through the prod search logs, it is evident that users,
when they are using parentheses to do searches, only run word characters
up against parens when they intend the parens to be part of the word (e.g.
U(1)), and when they are using parentheses to combine operators, they put
a space before and after them. As of writing, this is the behavior that
SQPP now expects, in order that it be able to handle such queries as
e(+)e(-) that contain operators in parentheses that should be interpreted
as words.
"""
def __init__(self, substitution_dict = {'and': '+', 'or': '|', 'not': '-'}):
self.substitution_dict = substitution_dict
self.specials = set(['(', ')', '+', '|', '-', '+ -'])
self.__tl_idx = 0
self.__tl_len = 0
# I think my names are both concise and clear
# pylint: disable=C0103
def _invenio_to_python_logical(self, q):
"""Translate the + and - in invenio query strings into & and ~."""
p = q
p = re.sub('\+ -', '&~', p)
p = re.sub('\+', '&', p)
p = re.sub('-', '~', p)
p = re.sub(' ~', ' & ~', p)
return p
def _python_logical_to_invenio(self, q):
"""Translate the & and ~ in logical expression strings into + and -."""
p = q
p = re.sub('\& ~', '-', p)
p = re.sub('~', '-', p)
p = re.sub('\&', '+', p)
return p
# pylint: enable=C0103
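# Round-trip examples for the two helpers above (illustrative; 'sqpp' is a
# hypothetical instance name used in the sketches throughout this module):
#   >>> sqpp = SearchQueryParenthesisedParser()
#   >>> sqpp._invenio_to_python_logical('p0 + p1 | p2')
#   'p0 & p1 | p2'
#   >>> sqpp._python_logical_to_invenio('p0 & p1 | p2')
#   'p0 + p1 | p2'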
def parse_query(self, query):
"""Make query into something suitable for search_engine.
This is the main entry point of the class.
Given an expression of the form:
"expr1 or expr2 (expr3 not (expr4 or expr5))"
produces annotated list output suitable for consumption by search_engine,
of the form:
['+', 'expr1', '|', 'expr2', '+', 'expr3 - expr4 | expr5']
parse_query() is a wrapper for self.tokenize() and self.parse().
"""
toklist = self.tokenize(query)
depth, balanced, dummy_d0_p = self.nesting_depth_and_balance(toklist)
if not balanced:
raise SyntaxError("Mismatched parentheses in "+str(toklist))
toklist, var_subs = self.substitute_variables(toklist)
if depth > 1:
toklist = self.tokenize(self.logically_reduce(toklist))
return self.parse(toklist, var_subs)
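# A minimal usage sketch (expected output per the docstring above):
#   >>> sqpp.parse_query('expr1 or expr2')
#   ['+', 'expr1', '|', 'expr2']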
def substitute_variables(self, toklist):
"""Given a token list, return a copy of token list in which all free
variables are bound with boolean variable names of the form 'pN'.
Additionally, all the substitutable logical operators are exchanged
for their symbolic form and implicit ands are made explicit
e.g., ((author:'ellis, j' and title:quark) or author:stevens jones)
becomes:
((p0 + p1) | p2 + p3)
with the substitution table:
{'p0': "author:'ellis, j'", 'p1': "title:quark",
'p2': "author:stevens", 'p3': "jones" }
Return value is the substituted token list and a copy of the
substitution table.
"""
def labels():
i = 0
while True:
yield 'p'+str(i)
i += 1
def filter_front_ands(toklist):
"""Filter out extra logical connectives and whitespace from the front."""
while toklist[0] == '+' or toklist[0] == '|' or toklist[0] == '':
toklist = toklist[1:]
return toklist
var_subs = {}
labeler = labels()
new_toklist = ['']
cannot_be_anded = self.specials.difference((')',))
for token in toklist:
token = token.lower()
if token in self.substitution_dict:
if token == 'not' and new_toklist[-1] == '+':
new_toklist[-1] = '-'
else:
new_toklist.append(self.substitution_dict[token])
elif token == '(':
if new_toklist[-1] not in self.specials:
new_toklist.append('+')
new_toklist.append(token)
elif token not in self.specials:
# apparently generators are hard for pylint to figure out
# Turns off msg about labeler not having a 'next' method
# pylint: disable=E1101
label = labeler.next()
# pylint: enable=E1101
var_subs[label] = token
if new_toklist[-1] not in cannot_be_anded:
new_toklist.append('+')
new_toklist.append(label)
else:
if token == '-' and new_toklist[-1] == '+':
new_toklist[-1] = '-'
else:
new_toklist.append(token)
return filter_front_ands(new_toklist), var_subs
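# For instance, following the substitution rules above:
#   >>> sqpp.substitute_variables(['expr1', 'or', 'expr2'])
#   (['p0', '|', 'p1'], {'p0': 'expr1', 'p1': 'expr2'})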
def nesting_depth_and_balance(self, token_list):
"""Checks that parentheses are balanced and counts how deep they nest"""
depth = 0
maxdepth = 0
depth0_pairs = 0
good_depth = True
for i in range(len(token_list)):
token = token_list[i]
if token == '(':
if depth == 0:
depth0_pairs += 1
depth += 1
if depth > maxdepth:
maxdepth += 1
elif token == ')':
depth -= 1
if depth == -1: # can only happen with unmatched )
good_depth = False # so force depth check to fail
depth = 0 # but keep maxdepth in good range
return maxdepth, depth == 0 and good_depth, depth0_pairs
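# e.g. a balanced list, nested two deep, with one top-level paren pair:
#   >>> sqpp.nesting_depth_and_balance(['(', 'p0', '(', 'p1', ')', ')'])
#   (2, True, 1)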
def logically_reduce(self, token_list):
"""Return token_list in conjunctive normal form as a string.
CNF has the property that there will only ever be one level of
parenthetical nesting, and all distributable operators (such as
the not in -(p | q)) will be fully distributed (as -p + -q).
"""
maxdepth, dummy_balanced, d0_p = self.nesting_depth_and_balance(token_list)
s = ' '.join(token_list)
s = self._invenio_to_python_logical(s)
last_maxdepth = 0
while maxdepth != last_maxdepth: # XXX: sometimes NaryExpr doesn't
try: # fully flatten Expr; but it usually
s = str(to_cnf(s)) # does in 2 passes FIXME: diagnose
except SyntaxError:
raise SyntaxError(str(s)+" couldn't be converted to a logic expression.")
last_maxdepth = maxdepth
maxdepth, dummy_balanced, d0_p = self.nesting_depth_and_balance(self.tokenize(s))
if d0_p == 1 and s[0] == '(' and s[-1] == ')': # s can come back with extra parens
s = s[1:-1]
s = self._python_logical_to_invenio(s)
return s
def tokenize(self, query):
"""Given a query string, return a list of tokens from that string.
* Isolates meaningful punctuation: ( ) + | -
* Keeps single- and double-quoted strings together without interpretation.
* Splits everything else on whitespace.
i.e.:
"expr1|expr2 (expr3-(expr4 or expr5))"
becomes:
['expr1', '|', 'expr2', '(', 'expr3', '-', '(', 'expr4', 'or', 'expr5', ')', ')']
special case:
"e(+)e(-)" interprets '+' and '-' as word characters since they are in parens with
word characters run up against them.
it becomes:
['e(+)e(-)']
"""
###
# Invariants:
# * Query is never modified
# * In every loop iteration, querytokens grows to the right
# * The only return point is at the bottom of the function, and the only
# return value is querytokens
###
def get_tokens(s):
"""
Given string s, return a list of s's tokens.
Adds space around special punctuation, then splits on whitespace.
"""
s = ' '+s
s = s.replace('->', '####DATE###RANGE##OP#') # XXX: Save '->'
s = re.sub('(?P<outside>[a-zA-Z0-9_,=:]+)\((?P<inside>[a-zA-Z0-9_,+-/]*)\)',
'#####\g<outside>####PAREN###\g<inside>##PAREN#', s) # XXX: Save U(1) and SL(2,Z)
s = re.sub('####PAREN###(?P<content0>[.0-9/-]*)(?P<plus>[+])(?P<content1>[.0-9/-]*)##PAREN#',
'####PAREN###\g<content0>##PLUS##\g<content1>##PAREN#', s)
s = re.sub('####PAREN###(?P<content0>([.0-9/]|##PLUS##)*)(?P<minus>[-])' +\
'(?P<content1>([.0-9/]|##PLUS##)*)##PAREN#',
'####PAREN###\g<content0>##MINUS##\g<content1>##PAREN#', s) # XXX: Save e(+)e(-)
for char in self.specials:
if char == '-':
s = s.replace(' -', ' - ')
s = s.replace(')-', ') - ')
s = s.replace('-(', ' - (')
else:
s = s.replace(char, ' '+char+' ')
s = re.sub('##PLUS##', '+', s)
s = re.sub('##MINUS##', '-', s) # XXX: Restore e(+)e(-)
s = re.sub('#####(?P<outside>[a-zA-Z0-9_,=:]+)####PAREN###(?P<inside>[a-zA-Z0-9_,+-/]*)##PAREN#',
'\g<outside>(\g<inside>)', s) # XXX: Restore U(1) and SL(2,Z)
s = s.replace('####DATE###RANGE##OP#', '->') # XXX: Restore '->'
return s.split()
querytokens = []
current_position = 0
re_quotes_match = re.compile(r'(?![\\])(".*?[^\\]")' + r"|(?![\\])('.*?[^\\]')")
for match in re_quotes_match.finditer(query):
match_start = match.start()
quoted_region = match.group(0).strip()
# clean the content after the previous quotes and before current quotes
unquoted = query[current_position : match_start]
querytokens.extend(get_tokens(unquoted))
# XXX: In case we end up with e.g. title:, "compton scattering", make it
# title:"compton scattering"
if querytokens and querytokens[-1] and querytokens[-1][-1] == ':':
querytokens[-1] += quoted_region
# XXX: In case we end up with e.g. "expr1",->,"expr2", make it
# "expr1"->"expr2"
elif len(querytokens) >= 2 and querytokens[-1] == '->':
arrow = querytokens.pop()
querytokens[-1] += arrow + quoted_region
else:
# add our newly tokenized content to the token list
querytokens.extend([quoted_region])
# move current position to the end of the tokenized content
current_position = match.end()
# get tokens from the last appearance of quotes until the query end
unquoted = query[current_position : len(query)]
querytokens.extend(get_tokens(unquoted))
return querytokens
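# Example, matching the docstring above:
#   >>> sqpp.tokenize('expr1|expr2 (expr3-(expr4 or expr5))')
#   ['expr1', '|', 'expr2', '(', 'expr3', '-', '(', 'expr4', 'or', 'expr5', ')', ')']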
def parse(self, token_list, variable_substitution_dict=None):
"""Make token_list consumable by search_engine.
Turns a list of tokens and a variable mapping into a grouped list
of subexpressions in the format suitable for use by search_engine,
e.g.:
['+', 'searchterm', '-', 'searchterm to exclude', '|', 'another term']
Incidentally, this works recursively so parens can cause arbitrarily
deep nestings. But since the search_engine doesn't know about nested
structures, we need to flatten the input structure first.
"""
###
# Invariants:
# * Token list is never modified
# * Balanced parens remain balanced; unbalanced parens are an error
# * Individual tokens may only be exchanged for items in the variable
# substitution dict; otherwise they pass through unmolested
# * Return value is built up mostly as a stack
###
op_symbols = self.substitution_dict.values()
self.__tl_idx = 0
self.__tl_len = len(token_list)
def inner_parse(token_list, open_parens=False):
'''
although it's not in the API, it seems sensible to comment
this function a bit.
a "distributable" token here is a token (e.g. a second-order
operator, ending in ':') which needs to be distributed across
the other tokens inside the inner parens
'''
if open_parens:
parsed_values = []
else:
parsed_values = ['+']
i = 0
while i < len(token_list):
token = token_list[i]
if i > 0 and parsed_values[-1] not in op_symbols:
parsed_values.append('+')
if token == '(':
# if we need to distribute something over the tokens inside the parens
# we will know it because... it will end in a :
# that part of the list will be 'px', '+', '('
distributing = (len(parsed_values) > 2 and parsed_values[-2].endswith(':') and parsed_values[-1] == '+')
if distributing:
# we don't need the + if we are distributing
parsed_values = parsed_values[:-1]
offset = self.__tl_len - len(token_list)
inner_value = inner_parse(token_list[i+1:], True)
inner_value = ' '.join(inner_value)
if distributing:
if len(self.tokenize(inner_value)) == 1:
parsed_values[-1] = parsed_values[-1] + inner_value
elif "'" in inner_value:
parsed_values[-1] = parsed_values[-1] + '"' + inner_value + '"'
elif '"' in inner_value:
parsed_values[-1] = parsed_values[-1] + "'" + inner_value + "'"
else:
parsed_values[-1] = parsed_values[-1] + '"' + inner_value + '"'
else:
parsed_values.append(inner_value)
self.__tl_idx += 1
i = self.__tl_idx - offset
elif token == ')':
if parsed_values[-1] in op_symbols:
parsed_values = parsed_values[:-1]
if len(parsed_values) > 1 and parsed_values[0] == '+' and parsed_values[1] in op_symbols:
parsed_values = parsed_values[1:]
return parsed_values
elif token in op_symbols:
if len(parsed_values) > 0:
parsed_values[-1] = token
else:
parsed_values = [token]
else:
if variable_substitution_dict != None and token in variable_substitution_dict:
token = variable_substitution_dict[token]
parsed_values.append(token)
i += 1
self.__tl_idx += 1
# If we have an extra start symbol, remove the default one
if parsed_values[1] in op_symbols:
parsed_values = parsed_values[1:]
return parsed_values
return inner_parse(token_list, False)
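# Example, combining parse() with a substitution table like the one
# produced by substitute_variables() above:
#   >>> sqpp.parse(['p0', '|', 'p1'], {'p0': 'expr1', 'p1': 'expr2'})
#   ['+', 'expr1', '|', 'expr2']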
class SpiresToInvenioSyntaxConverter:
"""Converts queries defined with SPIRES search syntax into queries
that use Invenio search syntax.
"""
# Constants defining fields
_DATE_ADDED_FIELD = 'datecreated:'
_DATE_UPDATED_FIELD = 'datemodified:'
_DATE_FIELD = 'year:'
_A_TAG = 'author:'
_EA_TAG = 'exactauthor:'
# Dictionary containing the matches between SPIRES keywords
# and their corresponding Invenio keywords or fields
# SPIRES keyword : Invenio keyword or field
_SPIRES_TO_INVENIO_KEYWORDS_MATCHINGS = {
# address
'address' : 'address:',
# affiliation
'affiliation' : 'affiliation:',
'affil' : 'affiliation:',
'aff' : 'affiliation:',
'af' : 'affiliation:',
'institution' : 'affiliation:',
'inst' : 'affiliation:',
# any field
'any' : 'anyfield:',
# author count
'ac' : 'authorcount:',
# bulletin
'bb' : 'reportnumber:',
'bbn' : 'reportnumber:',
'bull' : 'reportnumber:',
'bulletin-bd' : 'reportnumber:',
'bulletin-bd-no' : 'reportnumber:',
'eprint' : 'reportnumber:',
# citation / reference
'c' : 'reference:',
'citation' : 'reference:',
'cited' : 'reference:',
'jour-vol-page' : 'reference:',
'jvp' : 'reference:',
# collaboration
'collaboration' : 'collaboration:',
'collab-name' : 'collaboration:',
'cn' : 'collaboration:',
# conference number
'conf-number' : '111__g:',
'cnum' : '773__w:',
# country
'cc' : '044__a:',
'country' : '044__a:',
# date
'date': _DATE_FIELD,
'd': _DATE_FIELD,
# date added
'date-added': _DATE_ADDED_FIELD,
'dadd': _DATE_ADDED_FIELD,
'da': _DATE_ADDED_FIELD,
# date updated
'date-updated': _DATE_UPDATED_FIELD,
'dupd': _DATE_UPDATED_FIELD,
'du': _DATE_UPDATED_FIELD,
# first author
'fa' : 'firstauthor:',
'first-author' : 'firstauthor:',
# author
'a' : 'author:',
'au' : 'author:',
'author' : 'author:',
'name' : 'author:',
# exact author
# this is not a real keyword match. It is a pseudo-keyword that
# will be replaced later with an author search
'ea' : 'exactauthor:',
'exact-author' : 'exactauthor:',
# experiment
'exp' : 'experiment:',
'experiment' : 'experiment:',
'expno' : 'experiment:',
'sd' : 'experiment:',
'se' : 'experiment:',
# journal
'journal' : 'journal:',
'j' : 'journal:',
'published_in' : 'journal:',
'spicite' : 'journal:',
'vol' : 'journal:',
# journal page
'journal-page' : '773__c:',
'jp' : '773__c:',
# journal year
'journal-year' : '773__y:',
'jy' : '773__y:',
# key
'key' : '970__a:',
'irn' : '970__a:',
'record' : '970__a:',
'document' : '970__a:',
'documents' : '970__a:',
# keywords
'k' : 'keyword:',
'keywords' : 'keyword:',
'kw' : 'keyword:',
# note
'note' : '500__a:',
# old title
'old-title' : '246__a:',
'old-t' : '246__a:',
'ex-ti' : '246__a:',
'et' : '246__a:',
#postal code
'postalcode' : 'postalcode:',
'zip' : 'postalcode:',
'cc' : 'postalcode:', # NB: duplicate key; this entry overrides the 'cc' -> '044__a:' entry above
# ppf subject
'ppf-subject' : '650__a:',
'status' : '650__a:',
# recid
'recid' : 'recid:',
# report number
'r' : 'reportnumber:',
'rn' : 'reportnumber:',
'rept' : 'reportnumber:',
'report' : 'reportnumber:',
'report-num' : 'reportnumber:',
# title
't' : 'title:',
'ti' : 'title:',
'title' : 'title:',
'with-language' : 'title:',
# fulltext
'fulltext' : 'fulltext:',
'ft' : 'fulltext:',
# topic
'topic' : '695__a:',
'tp' : '695__a:',
'hep-topic' : '695__a:',
'desy-keyword' : '695__a:',
'dk' : '695__a:',
# topcite
'topcit' : 'cited:',
'topcite' : 'cited:',
# captions
'caption' : 'caption:',
# category
'arx' : '037__c:',
'category' : '037__c:',
# primarch
'parx' : '037__c:',
'primarch' : '037__c:',
# texkey
'texkey' : '035__z:',
# type code
'tc' : 'collection:',
'ty' : 'collection:',
'type' : 'collection:',
'type-code' : 'collection:',
'scl': 'collection:',
'ps': 'collection:',
# field code
'f' : 'subject:',
'fc' : 'subject:',
'field' : 'subject:',
'field-code' : 'subject:',
'subject' : 'subject:',
# coden
'bc' : 'journal:',
'browse-only-indx' : 'journal:',
'coden' : 'journal:',
'journal-coden' : 'journal:',
# jobs specific codes
'job' : 'title:',
'position' : 'title:',
'region' : 'region:',
'continent' : 'region:',
'deadline' : '046__a:',
'rank' : 'rank:',
# map all the keywords without a match to the empty string;
# this removes the noise from unknown keywords in the search
# and searches all fields for the words following the keywords
# energy
'e' : '',
'energy' : '',
'energyrange-code' : '',
# exact experiment number
'ee' : '',
'exact-exp' : '',
'exact-expno' : '',
# hidden note
'hidden-note' : '',
'hn' : '',
# ppf
'ppf' : '',
'ppflist' : '',
# slac topics
'ppfa' : '',
'slac-topics' : '',
'special-topics' : '',
'stp' : '',
# test index
'test' : '',
'testindex' : '',
}
_SECOND_ORDER_KEYWORD_MATCHINGS = {
'refersto' : 'refersto:',
'refs': 'refersto:',
'citedby' : 'citedby:'
}
_INVENIO_KEYWORDS_FOR_SPIRES_PHRASE_SEARCHES = [
'affiliation:',
#'cited:', # topcite is technically a phrase index - this isn't necessary
'773__y:', # journal-year
'773__c:', # journal-page
'773__w:', # cnum
'044__a:', # country code
'subject:', # field code
'collection:', # type code
'035__z:', # texkey
# also exact expno, corp-auth, url, abstract, doi, mycite, citing
# but we have no invenio equivalents for these ATM
]
def __init__(self):
"""Initialize the state of the converter"""
self._months = {}
self._month_name_to_month_number = {}
self._init_months()
self._compile_regular_expressions()
def _compile_regular_expressions(self):
"""Compiles some of the regular expressions that are used in the class
for higher performance."""
# regular expression that matches the contents in single and double quotes
# taking into account whether they are escaped.
self._re_quotes_match = re.compile(r'(?![\\])(".*?[^\\]")' + r"|(?![\\])('.*?[^\\]')")
# match cases where a keyword distributes across a conjunction
self._re_distribute_keywords = re.compile(r'''(?ix) # verbose, ignorecase on
\b(?P<keyword>\S*:) # a keyword is anything that's not whitespace with a colon
(?P<content>[^:]+?)\s* # content is the part that comes after the keyword; it should NOT
# have colons in it! that implies that we might be distributing
# a keyword OVER another keyword. see ticket #701
(?P<combination>\ and\ not\ |\ and\ |\ or\ |\ not\ )\s*
(?P<last_content>[^:]*?) # oh look, content without a keyword!
(?=\ and\ |\ or\ |\ not\ |$)''')
# massaging SPIRES quirks
self._re_pattern_IRN_search = re.compile(r'970__a:(?P<irn>\d+)')
self._re_topcite_match = re.compile(r'(?P<x>cited:\d+)\+')
# regular expression that matches author patterns
# and author patterns with second-order-ops on top
# does not match names with " or ' around them, since
# those should not be touched
self._re_author_match = re.compile(r'''(?ix) # verbose, ignorecase
\b((?P<secondorderop>[^\s]+:)?) # do we have a second-order-op on top?
((?P<first>first)?)author:(?P<name>
[^\'\"] # first character not a quotemark
[^()]*? # some stuff that isn't parentheses (that is dealt with in pp)
[^\'\"]) # last character not a quotemark
(?=\ and\ not\ |\ and\ |\ or\ |\ not\ |$)''')
# regular expression that matches exact author patterns
# the group defined in this regular expression is used in method
# _convert_spires_exact_author_search_to_invenio_author_search(...)
# in case of changes correct also the code in this method
self._re_exact_author_match = re.compile(r'\b((?P<secondorderop>[^\s]+:)?)exactauthor:(?P<author_name>[^\'\"].*?[^\'\"]\b)(?= and not | and | or | not |$)', re.IGNORECASE)
# match a second-order operator with no operator following it
self._re_second_order_op_no_index_match = re.compile(r'''(?ix) # ignorecase, verbose
(^|\b|:)(?P<second_order_op>(refersto|citedby):)
(?P<search_terms>[^\"\'][^:]+?) # anything without an index should be absorbed here
\s*
(?P<conjunction_or_next_keyword>(\ and\ |\ not\ |\ or\ |\ \w+:\w+|$))
''')
# match search term, its content (words that are searched) and
# the operator preceding the term.
self._re_search_term_pattern_match = re.compile(r'\b(?P<combine_operator>find|and|or|not)\s+(?P<search_term>\S+:)(?P<search_content>.+?)(?= and not | and | or | not |$)', re.IGNORECASE)
# match journal searches
self._re_search_term_is_journal = re.compile(r'''(?ix) # verbose, ignorecase
\b(?P<leading>(find|and|or|not)\s+journal:) # first combining operator and index
(?P<search_content>.+?) # what we are searching
(?=\ and\ not\ |\ and\ |\ or\ |\ not\ |$)''')
# regular expression matching date after pattern
self._re_date_after_match = re.compile(r'\b(?P<searchop>d|date|dupd|dadd|da|date-added|du|date-updated)\b\s*(after|>)\s*(?P<search_content>.+?)(?= and not | and | or | not |$)', re.IGNORECASE)
# regular expression matching date after pattern
self._re_date_before_match = re.compile(r'\b(?P<searchop>d|date|dupd|dadd|da|date-added|du|date-updated)\b\s*(before|<)\s*(?P<search_content>.+?)(?= and not | and | or | not |$)', re.IGNORECASE)
# match date searches which have been keyword-substituted
self._re_keysubbed_date_expr = re.compile(r'\b(?P<term>(' + self._DATE_ADDED_FIELD + ')|(' + self._DATE_UPDATED_FIELD + ')|(' + self._DATE_FIELD + '))(?P<content>.+?)(?= and not | and | or | not |$)', re.IGNORECASE)
# for finding (and changing) a variety of different SPIRES search keywords
self._re_spires_find_keyword = re.compile('^(f|fin|find)\s+', re.IGNORECASE)
# for finding boolean expressions
self._re_boolean_expression = re.compile(r' and | or | not | and not ')
# patterns for subbing out spaces within quotes temporarily
self._re_pattern_single_quotes = re.compile("'(.*?)'")
self._re_pattern_double_quotes = re.compile("\"(.*?)\"")
self._re_pattern_regexp_quotes = re.compile("\/(.*?)\/")
self._re_pattern_space = re.compile("__SPACE__")
self._re_pattern_equals = re.compile("__EQUALS__")
def is_applicable(self, query):
"""Is this converter applicable to this query?
Return true if query begins with find, fin, or f, or if it contains
a SPIRES-specific keyword (a, t, etc.), or if it contains the invenio
author: field search. """
if not CFG_WEBSEARCH_SPIRES_SYNTAX:
#SPIRES syntax is switched off
return False
query = query.lower()
if self._re_spires_find_keyword.match(query):
#leading 'find' is present and SPIRES syntax is switched on
return True
if CFG_WEBSEARCH_SPIRES_SYNTAX > 1:
for word in query.split(' '):
if self._SPIRES_TO_INVENIO_KEYWORDS_MATCHINGS.has_key(word):
return True
return False
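# Illustrative sketch ('converter' is a hypothetical instance name, and the
# results assume CFG_WEBSEARCH_SPIRES_SYNTAX is enabled):
#   >>> converter = SpiresToInvenioSyntaxConverter()
#   >>> converter.is_applicable('find a ellis')
#   True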
def convert_query(self, query):
"""Convert SPIRES syntax queries to Invenio syntax.
Do nothing to queries not in SPIRES syntax."""
# SPIRES syntax allows searches with 'find' or 'fin'.
if self.is_applicable(query):
query = re.sub(self._re_spires_find_keyword, 'find ', query)
if not query.startswith('find'):
query = 'find ' + query
# a holdover from SPIRES syntax is e.g. date = 2000 rather than just date 2000
query = self._remove_extraneous_equals_signs(query)
# these calls come before keyword replacement because once keywords
# are replaced, the date keyword becomes a specific field search
# and the DATE keyword would no longer match in DATE BEFORE or DATE AFTER
query = self._convert_spires_date_before_to_invenio_span_query(query)
query = self._convert_spires_date_after_to_invenio_span_query(query)
# call to _replace_spires_keywords_with_invenio_keywords should be at the
# beginning because the next methods use the result of the replacement
query = self._standardize_already_invenio_keywords(query)
query = self._replace_spires_keywords_with_invenio_keywords(query)
query = self._normalise_journal_page_format(query)
query = self._distribute_keywords_across_combinations(query)
query = self._distribute_and_quote_second_order_ops(query)
query = self._convert_dates(query)
query = self._convert_irns_to_spires_irns(query)
query = self._convert_topcite_to_cited(query)
query = self._convert_spires_author_search_to_invenio_author_search(query)
query = self._convert_spires_exact_author_search_to_invenio_author_search(query)
query = self._convert_spires_truncation_to_invenio_truncation(query)
query = self._expand_search_patterns(query)
# remove FIND in the beginning of the query as it is not necessary in Invenio
query = query[4:]
query = query.strip()
return query
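# A small end-to-end sketch (again assuming CFG_WEBSEARCH_SPIRES_SYNTAX):
#   >>> converter.convert_query('find t quark')
#   'title:quark'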
def _init_months(self):
"""Defines a dictionary matching the name
of the month with its corresponding number"""
# this dictionary is used when generating match patterns for months
self._months = {'jan':'01', 'january':'01',
'feb':'02', 'february':'02',
'mar':'03', 'march':'03',
'apr':'04', 'april':'04',
'may':'05', # 'may' has no distinct abbreviation
'jun':'06', 'june':'06',
'jul':'07', 'july':'07',
'aug':'08', 'august':'08',
'sep':'09', 'september':'09',
'oct':'10', 'october':'10',
'nov':'11', 'november':'11',
'dec':'12', 'december':'12'}
# this dictionary is used to transform the name of the month
# into the number used in the date format. For this reason it
# also contains the numbers themselves, to simplify the conversion
self._month_name_to_month_number = {'1':'01', '01':'01',
'2':'02', '02':'02',
'3':'03', '03':'03',
'4':'04', '04':'04',
'5':'05', '05':'05',
'6':'06', '06':'06',
'7':'07', '07':'07',
'8':'08', '08':'08',
'9':'09', '09':'09',
'10':'10',
'11':'11',
'12':'12',}
# combine it with months in order to cover all the cases
self._month_name_to_month_number.update(self._months)
def _get_month_names_match(self):
"""Retruns part of a patter that matches month in a date"""
months_match = ''
for month_name in self._months.keys():
months_match = months_match + month_name + '|'
months_match = r'\b(' + months_match[0:-1] + r')\b'
return months_match
def _convert_dates(self, query):
"""Tries to find dates in query and make them look like ISO-8601."""
def mangle_with_dateutils(query):
DEFAULT = datetime(datetime.today().year, 1, 1)
result = ''
position = 0
for match in self._re_keysubbed_date_expr.finditer(query):
result += query[position : match.start()]
isodates = []
dates = match.group('content').split('->') # Warning: generalizing but should only ever be 2 items
for datestamp in dates:
if datestamp != None:
if re.match('[0-9]{1,4}$', datestamp):
isodates.append(datestamp)
else:
units = 0
datestamp = re.sub('yesterday', datetime.strftime(datetime.today()
+du_delta(days=-1), '%Y-%m-%d'),
datestamp)
datestamp = re.sub('today', datetime.strftime(datetime.today(), '%Y-%m-%d'), datestamp)
datestamp = re.sub('this week', datetime.strftime(datetime.today()
+du_delta(days=-(datetime.today().isoweekday()%7)), '%Y-%m-%d'),
datestamp)
datestamp = re.sub('last week', datetime.strftime(datetime.today()
+du_delta(days=-((datetime.today().isoweekday()%7)+7)), '%Y-%m-%d'),
datestamp)
datestamp = re.sub('this month', datetime.strftime(datetime.today(), '%Y-%m'),
datestamp)
datestamp = re.sub('last month', datetime.strftime(datetime.today()
+du_delta(months=-1), '%Y-%m'),
datestamp)
datemath = re.match(r'(?P<datestamp>.+)\s+(?P<operator>[-+])\s+(?P<units>\d+)', datestamp)
if datemath:
datestamp = datemath.group('datestamp')
units += int(datemath.group('operator') + datemath.group('units'))
try:
dtobj = du_parser.parse(datestamp, default=DEFAULT)
dtobj = dtobj + du_delta(days=units)
if dtobj.day == 1:
isodates.append("%d-%02d" % (dtobj.year, dtobj.month))
else:
isodates.append("%d-%02d-%02d" % (dtobj.year, dtobj.month, dtobj.day))
except ValueError:
isodates.append(datestamp)
daterange = '->'.join(isodates)
result += match.group('term') + daterange
position = match.end()
result += query[position : ]
return result
if GOT_DATEUTIL:
query = mangle_with_dateutils(query)
# else do nothing with the dates
return query
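# Bare years pass through unchanged, while relative forms such as
# 'last month' are rewritten to ISO values when dateutil is available:
#   >>> converter._convert_dates('year:2000')
#   'year:2000'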
def _convert_irns_to_spires_irns(self, query):
"""Prefix IRN numbers with SPIRES- so they match the INSPIRE format."""
def create_replacement_pattern(match):
"""method used for replacement with regular expression"""
return '970__a:SPIRES-' + match.group('irn')
query = self._re_pattern_IRN_search.sub(create_replacement_pattern, query)
return query
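# Example:
#   >>> converter._convert_irns_to_spires_irns('970__a:12345')
#   '970__a:SPIRES-12345'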
def _convert_topcite_to_cited(self, query):
"""Replace SPIRES topcite x+ with cited:x->999999999"""
def create_replacement_pattern(match):
"""method used for replacement with regular expression"""
return match.group('x') + '->999999999'
query = self._re_topcite_match.sub(create_replacement_pattern, query)
return query
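# Example:
#   >>> converter._convert_topcite_to_cited('cited:50+')
#   'cited:50->999999999'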
def _convert_spires_date_after_to_invenio_span_query(self, query):
"""Converts date after SPIRES search term into invenio span query"""
def create_replacement_pattern(match):
"""method used for replacement with regular expression"""
return match.group('searchop') + ' ' + match.group('search_content') + '->9999'
query = self._re_date_after_match.sub(create_replacement_pattern, query)
return query
def _convert_spires_date_before_to_invenio_span_query(self, query):
"""Converts date before SPIRES search term into invenio span query"""
# method used for replacement with regular expression
def create_replacement_pattern(match):
return match.group('searchop') + ' ' + '0->' + match.group('search_content')
query = self._re_date_before_match.sub(create_replacement_pattern, query)
return query
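# Examples for the two span-query rewrites above (these run before keyword
# replacement, so 'date' is still the literal SPIRES keyword):
#   >>> converter._convert_spires_date_after_to_invenio_span_query('date after 2000')
#   'date 2000->9999'
#   >>> converter._convert_spires_date_before_to_invenio_span_query('date before 2000')
#   'date 0->2000'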
def _expand_search_patterns(self, query):
"""Expands search queries.
If a search term is followed by several words e.g.
author:ellis or title:THESE THREE WORDS it is expanded to
author:ellis or (title:THESE and title:THREE...)
All keywords are thus expanded. XXX: this may lead to surprising
results for any later parsing stages if we're not careful.
"""
def create_replacements(term, content):
result = ''
content = content.strip()
# replace spaces within quotes by __SPACE__ temporarily:
content = self._re_pattern_single_quotes.sub(lambda x: "'"+string.replace(x.group(1), ' ', '__SPACE__')+"'", content)
content = self._re_pattern_double_quotes.sub(lambda x: "\""+string.replace(x.group(1), ' ', '__SPACE__')+"\"", content)
content = self._re_pattern_regexp_quotes.sub(lambda x: "/"+string.replace(x.group(1), ' ', '__SPACE__')+"/", content)
if term in self._INVENIO_KEYWORDS_FOR_SPIRES_PHRASE_SEARCHES \
and not self._re_boolean_expression.search(content) and ' ' in content:
# the case of things which should be searched as phrases
result = term + '"' + content + '"'
else:
words = content.split()
if len(words) == 0:
# this should almost never happen; it requires the user to type e.g. 'find a junk:'
result = term
elif len(words) == 1:
# this is more common but still occasional
result = term + words[0]
else:
# general case
result = '(' + term + words[0]
for word in words[1:]:
result += ' and ' + term + word
result += ')'
# replace back __SPACE__ by spaces:
result = self._re_pattern_space.sub(" ", result)
return result.strip()
result = ''
current_position = 0
for match in self._re_search_term_pattern_match.finditer(query):
result += query[current_position : match.start()]
result += ' ' + match.group('combine_operator') + ' '
result += create_replacements(match.group('search_term'), match.group('search_content'))
current_position = match.end()
result += query[current_position : len(query)]
return result.strip()
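# Example of the expansion (title: is not a phrase index, so each word
# gets its own copy of the keyword):
#   >>> converter._expand_search_patterns('find title:quark gluon plasma')
#   'find (title:quark and title:gluon and title:plasma)'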
def _remove_extraneous_equals_signs(self, query):
"""In SPIRES, both date = 2000 and date 2000 are acceptable. Get rid of the ="""
query = self._re_pattern_single_quotes.sub(lambda x: "'"+string.replace(x.group(1), '=', '__EQUALS__')+"'", query)
query = self._re_pattern_double_quotes.sub(lambda x: "\""+string.replace(x.group(1), '=', '__EQUALS__')+'\"', query)
query = self._re_pattern_regexp_quotes.sub(lambda x: "/"+string.replace(x.group(1), '=', '__EQUALS__')+"/", query)
query = query.replace('=', '')
query = self._re_pattern_equals.sub("=", query)
return query
def _convert_spires_truncation_to_invenio_truncation(self, query):
"""Replace SPIRES truncation symbol # with invenio trancation symbol *"""
return query.replace('#', '*')
def _convert_spires_exact_author_search_to_invenio_author_search(self, query):
"""Converts SPIRES search patterns for exact author into search pattern
for invenio"""
# method used for replacement with regular expression
def create_replacement_pattern(match):
# the regular expression where this group name is defined is in
# the method _compile_regular_expressions()
return self._EA_TAG + '"' + match.group('author_name') + '"'
query = self._re_exact_author_match.sub(create_replacement_pattern, query)
return query
def _convert_spires_author_search_to_invenio_author_search(self, query):
"""Converts SPIRES search patterns for authors to search patterns in invenio
that give similar results to the spires search.
"""
# result of the replacement
result = ''
current_position = 0
for match in self._re_author_match.finditer(query):
result += query[current_position : match.start() ]
if match.group('secondorderop'):
result += match.group('secondorderop')
scanned_name = NameScanner.scan(match.group('name'))
author_atoms = self._create_author_search_pattern_from_fuzzy_name_dict(scanned_name)
if match.group('first'):
author_atoms = author_atoms.replace('author:', 'firstauthor:')
if author_atoms.find(' ') == -1:
result += author_atoms + ' '
else:
result += '(' + author_atoms + ') '
current_position = match.end()
result += query[current_position : len(query)]
return result
def _create_author_search_pattern_from_fuzzy_name_dict(self, fuzzy_name):
"""Creates an invenio search pattern for an author from a fuzzy name dict"""
author_name = ''
author_middle_name = ''
author_surname = ''
full_search = ''
if len(fuzzy_name['nonlastnames']) > 0:
author_name = fuzzy_name['nonlastnames'][0]
if len(fuzzy_name['nonlastnames']) == 2:
author_middle_name = fuzzy_name['nonlastnames'][1]
if len(fuzzy_name['nonlastnames']) > 2:
author_middle_name = ' '.join(fuzzy_name['nonlastnames'][1:])
if fuzzy_name['raw']:
full_search = fuzzy_name['raw']
author_surname = ' '.join(fuzzy_name['lastnames'])
NAME_IS_INITIAL = (len(author_name) == 1)
NAME_IS_NOT_INITIAL = not NAME_IS_INITIAL
# we expect to have at least surname
if author_surname == '' or author_surname == None:
return ''
# ellis ---> "author:ellis"
#if author_name == '' or author_name == None:
if not author_name:
return self._A_TAG + author_surname
# ellis, j ---> "ellis, j*"
if NAME_IS_INITIAL and not author_middle_name:
return self._A_TAG + '"' + author_surname + ', ' + author_name + '*"'
# if there is middle name we expect to have also name and surname
# ellis, j. r. ---> ellis, j* r*
# j r ellis ---> ellis, j* r*
# ellis, john r. ---> ellis, j* r* or ellis, j. r. or ellis, jo. r.
# ellis, john r. ---> author:ellis, j* r* or exactauthor:ellis, j r or exactauthor:ellis jo r
if author_middle_name:
search_pattern = self._A_TAG + '"' + author_surname + ', ' + author_name + '*' + ' ' + author_middle_name.replace(" ","* ") + '*"'
if NAME_IS_NOT_INITIAL:
for i in range(1, len(author_name)):
search_pattern += ' or ' + self._EA_TAG + "\"%s, %s %s\"" % (author_surname, author_name[0:i], author_middle_name)
return search_pattern
# ellis, jacqueline ---> "ellis, jacqueline" or "ellis, j.*" or "ellis, j" or "ellis, ja.*" or "ellis, ja" or "ellis, jacqueline *, ellis, j *"
# in case we don't use SPIRES data, the ending dot is omitted.
search_pattern = self._A_TAG + '"' + author_surname + ', ' + author_name + '*"'
search_pattern += " or " + self._EA_TAG + "\"%s, %s *\"" % (author_surname, author_name[0])
if NAME_IS_NOT_INITIAL:
for i in range(1,len(author_name)):
search_pattern += ' or ' + self._EA_TAG + "\"%s, %s\"" % (author_surname, author_name[0:i])
search_pattern += ' or %s"%s, *"' % (self._A_TAG, full_search)
return search_pattern
def _normalise_journal_page_format(self, query):
"""Phys.Lett, 0903, 024 -> Phys.Lett,0903,024"""
def _is_triple(search):
return (len(re.findall('\s+', search)) + len(re.findall(':', search))) == 2
def _normalise_spaces_and_colons_to_commas_in_triple(search):
if not _is_triple(search):
return search
search = re.sub(',\s+', ',', search)
search = re.sub('\s+', ',', search)
search = re.sub(':', ',', search)
return search
result = ""
current_position = 0
for match in self._re_search_term_is_journal.finditer(query):
result += query[current_position : match.start()]
result += match.group('leading')
search = match.group('search_content')
search = _normalise_spaces_and_colons_to_commas_in_triple(search)
result += search
current_position = match.end()
result += query[current_position : ]
return result
def _standardize_already_invenio_keywords(self, query):
"""Replaces invenio keywords kw with "and kw" in order to
parse them correctly further down the line."""
unique_invenio_keywords = set(self._SPIRES_TO_INVENIO_KEYWORDS_MATCHINGS.values()) |\
set(self._SECOND_ORDER_KEYWORD_MATCHINGS.values())
unique_invenio_keywords.remove('') # for the ones that don't have invenio equivalents
for invenio_keyword in unique_invenio_keywords:
query = re.sub("(?<!... \+|... -| and |. or | not |....:)"+invenio_keyword, "and "+invenio_keyword, query)
query = re.sub("\+"+invenio_keyword, "and "+invenio_keyword, query)
query = re.sub("-"+invenio_keyword, "and not "+invenio_keyword, query)
return query
def _replace_spires_keywords_with_invenio_keywords(self, query):
"""Replaces SPIRES keywords that have directly
corresponding Invenio keywords
Replacements are done only in content that is not in quotes."""
# result of the replacement
result = ""
current_position = 0
for match in self._re_quotes_match.finditer(query):
# clean the content after the previous quotes and before current quotes
cleanable_content = query[current_position : match.start()]
cleanable_content = self._replace_all_spires_keywords_in_string(cleanable_content)
# get the content in the quotes (group one matches double
# quotes, group 2 singles)
if match.group(1):
quoted_content = match.group(1)
elif match.group(2):
quoted_content = match.group(2)
# append the processed content to the result
result = result + cleanable_content + quoted_content
# move current position at the end of the processed content
current_position = match.end()
# clean the content from the last appearance of quotes till the end of the query
cleanable_content = query[current_position : len(query)]
cleanable_content = self._replace_all_spires_keywords_in_string(cleanable_content)
result = result + cleanable_content
return result
def _replace_all_spires_keywords_in_string(self, query):
"""Replaces all SPIRES keywords in the string with their
corresponding Invenio keywords"""
for spires_keyword, invenio_keyword in self._SPIRES_TO_INVENIO_KEYWORDS_MATCHINGS.iteritems():
query = self._replace_keyword(query, spires_keyword, invenio_keyword)
for spires_keyword, invenio_keyword in self._SECOND_ORDER_KEYWORD_MATCHINGS.iteritems():
query = self._replace_second_order_keyword(query, spires_keyword, invenio_keyword)
return query
def _replace_keyword(self, query, old_keyword, new_keyword):
"""Replaces old keyword in the query with a new keyword"""
regex_string = r'(?P<operator>(^find|\band|\bor|\bnot|\brefersto|\bcitedby|^)\b[:\s\(]*)' + \
old_keyword + r'(?P<end>[\s\(]+|$)'
regular_expression = re.compile(regex_string, re.IGNORECASE)
result = regular_expression.sub(r'\g<operator>' + new_keyword + r'\g<end>', query)
result = re.sub(':\s+', ':', result)
return result
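# Example (the trailing ':\s+' cleanup removes the space left by the match):
#   >>> converter._replace_keyword('find t quark', 't', 'title:')
#   'find title:quark'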
def _replace_second_order_keyword(self, query, old_keyword, new_keyword):
"""Replaces old second-order keyword in the query with a new keyword"""
regular_expression =\
re.compile(r'''(?ix) # verbose, ignorecase
(?P<operator>
(^find|\band|\bor|\bnot|\brefersto|\bcitedby|^)\b # operator preceding our operator
[:\s\(]* # trailing colon, spaces, parens, etc. for that operator
)
%s # the keyword we're searching for
(?P<endorop>
\s*[a-z]+:| # either an operator (like author:)
[\s\(]+| # or a paren opening
$ # or the end of the string
)''' % old_keyword)
result = regular_expression.sub(r'\g<operator>' + new_keyword + r'\g<endorop>', query)
result = re.sub(':\s+', ':', result)
return result
def _distribute_keywords_across_combinations(self, query):
"""author:ellis and james -> author:ellis and author:james"""
# method used for replacement with regular expression
def create_replacement_pattern(match):
return match.group('keyword') + match.group('content') + \
match.group('combination') + match.group('keyword') + \
match.group('last_content')
still_matches = True
while still_matches:
query = self._re_distribute_keywords.sub(create_replacement_pattern, query)
still_matches = self._re_distribute_keywords.search(query)
query = re.sub(r'\s+', ' ', query)
return query
def _distribute_and_quote_second_order_ops(self, query):
"""refersto:s parke -> refersto:\"s parke\""""
def create_replacement_pattern(match):
return match.group('second_order_op') + '"' +\
match.group('search_terms') + '"' +\
match.group('conjunction_or_next_keyword')
for match in self._re_second_order_op_no_index_match.finditer(query):
query = self._re_second_order_op_no_index_match.sub(create_replacement_pattern, query)
query = re.sub(r'\s+', ' ', query)
return query
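if __name__ == '__main__':
    # Minimal smoke test -- an illustrative sketch only, since the imports at
    # the top of this module require a working Invenio installation.
    sqpp = SearchQueryParenthesisedParser()
    # Expected, per the parse_query() docstring:
    # ['+', 'expr1', '|', 'expr2', '+', 'expr3 - expr4 | expr5']
    print sqpp.parse_query('expr1 or expr2 (expr3 not (expr4 or expr5))')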
