diff --git a/modules/websearch/lib/search_engine_query_parser.py b/modules/websearch/lib/search_engine_query_parser.py index 0ce806034..eff25cfa3 100644 --- a/modules/websearch/lib/search_engine_query_parser.py +++ b/modules/websearch/lib/search_engine_query_parser.py @@ -1,972 +1,972 @@ # -*- coding: utf-8 -*- ## This file is part of Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2010 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. # pylint: disable=C0301 """Invenio Search Engine query parsers.""" import re import string from datetime import datetime try: import dateutil.parser GOT_DATEUTIL = True except ImportError: # Ok, no date parsing is possible, but continue anyway, # since this package is only recommended, not mandatory. GOT_DATEUTIL = False from invenio.bibindex_engine_tokenizer import BibIndexFuzzyNameTokenizer as FNT from invenio.logicutils import to_cnf NameScanner = FNT() class InvenioWebSearchMismatchedParensError(Exception): """Exception for parse errors caused by mismatched parentheses.""" def __init__(self, message): """Initialization.""" self.message = message def __str__(self): """String representation.""" return repr(self.message) class SearchQueryParenthesisedParser(object): """Search query parser that handles arbitrarily-nested parentheses Parameters: * substitution_dict: a dictionary mapping strings to other strings. By default, maps 'and', 'or' and 'not' to '+', '|', and '-'. Dictionary values will be treated as valid operators for output. """ def __init__(self, substitution_dict = {'and': '+', 'or': '|', 'not': '-'}): self.substitution_dict = substitution_dict self.specials = set(['(', ')', '+', '|', '-', '+ -']) self.__tl_idx = 0 self.__tl_len = 0 # I think my names are both concise and clear # pylint: disable=C0103 def _invenio_to_python_logical(self, q): """Translate the + and - in invenio query strings into & and ~.""" p = q p = re.sub('\+ -', '&~', p) p = re.sub('\+', '&', p) p = re.sub('-', '~', p) p = re.sub(' ~', ' & ~', p) return p def _python_logical_to_invenio(self, q): """Translate the & and ~ in logical expression strings into + and -.""" p = q p = re.sub('\& ~', '-', p) p = re.sub('~', '-', p) p = re.sub('\&', '+', p) return p # pylint: enable=C0103 def parse_query(self, query): """Make query into something suitable for search_engine. This is the main entry point of the class. Given an expression of the form: "expr1 or expr2 (expr3 not (expr4 or expr5))" produces annotated list output suitable for consumption by search_engine, of the form: ['+', 'expr1', '|', 'expr2', '+', 'expr3 - expr4 | expr5'] parse_query() is a wrapper for self.tokenize() and self.parse().
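        As a concrete sketch of the expected behaviour (these examples mirror
        the unit tests in search_engine_query_parser_tests.py; the whitespace
        inside grouped subexpressions comes from the joining done in parse()):

            >>> p = SearchQueryParenthesisedParser()
            >>> p.parse_query('(expr1) - expr2 + (expr3)')
            ['+', 'expr1', '-', 'expr2', '+', 'expr3']
            >>> p.parse_query('(expr1 and expr2) or expr3')
            ['+', 'expr1 + expr2', '|', 'expr3']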
""" toklist = self.tokenize(query) depth, balanced, d0_p = self.nesting_depth_and_balance(toklist) if not balanced: raise SyntaxError("Mismatched parentheses in "+str(toklist)) toklist, var_subs = self.substitute_variables(toklist) if depth > 1: toklist = self.tokenize(self.logically_reduce(toklist)) return self.parse(toklist, var_subs) def substitute_variables(self, toklist): """Given a token list, return a copy of token list in which all free variables are bound with boolean variable names of the form 'pN'. Additionally, all the substitutable logical operators are exchanged for their symbolic form and implicit ands are made explicit e.g., ((author:'ellis, j' and title:quark) or author:stevens jones) becomes: ((p0 + p1) | p2 + p3) with the substitution table: {'p0': "author:'ellis, j'", 'p1': "title:quark", 'p2': "author:stevens", 'p3': "jones" } Return value is the substituted token list and a copy of the substitution table. """ def labels(): i = 0 while True: yield 'p'+str(i) i += 1 def filter_front_ands(toklist): """Filter out extra logical connectives and whitespace from the front.""" while toklist[0] == '+' or toklist[0] == '|' or toklist[0] == '': toklist = toklist[1:] return toklist var_subs = {} labeler = labels() new_toklist = [''] cannot_be_anded = self.specials.difference((')',)) for token in toklist: token = token.lower() if token in self.substitution_dict: if token == 'not' and new_toklist[-1] == '+': - new_toklist[-1] = '+ -' + new_toklist[-1] = '-' else: new_toklist.append(self.substitution_dict[token]) elif token == '(': if new_toklist[-1] not in self.specials: new_toklist.append('+') new_toklist.append(token) elif token not in self.specials: # apparently generators are hard for pylint to figure out # Turns off msg about labeler not having a 'next' method # pylint: disable=E1101 label = labeler.next() # pylint: enable=E1101 var_subs[label] = token if new_toklist[-1] not in cannot_be_anded: new_toklist.append('+') new_toklist.append(label) else: if token == '-' and new_toklist[-1] == '+': - new_toklist[-1] = '+ -' + new_toklist[-1] = '-' else: new_toklist.append(token) return filter_front_ands(new_toklist), var_subs def nesting_depth_and_balance(self, token_list): """Checks that parentheses are balanced and counts how deep they nest""" depth = 0 maxdepth = 0 depth0_pairs = 0 good_depth = True for i in range(len(token_list)): token = token_list[i] if token == '(': if depth == 0: depth0_pairs += 1 depth += 1 if depth > maxdepth: maxdepth += 1 elif token == ')': depth -= 1 if depth == -1: # can only happen with unmatched ) good_depth = False # so force depth check to fail depth = 0 # but keep maxdepth in good range return maxdepth, depth == 0 and good_depth, depth0_pairs def logically_reduce(self, token_list): """Return token_list in conjunctive normal form as a string. CNF has the property that there will only ever be one level of parenthetical nesting, and all distributable operators (such as the not in -(p | q) will be fully distributed (as -p + -q). 
""" maxdepth, balanced, d0_p = self.nesting_depth_and_balance(token_list) s = ' '.join(token_list) s = self._invenio_to_python_logical(s) last_maxdepth = 0 while maxdepth != last_maxdepth: # XXX: sometimes NaryExpr doesn't try: # fully flatten Expr; but it usually s = str(to_cnf(s)) # does in 2 passes FIXME: diagnose except SyntaxError: raise SyntaxError(str(s)+" couldn't be converted to a logic expression.") last_maxdepth = maxdepth maxdepth, balanced, d0_p = self.nesting_depth_and_balance(self.tokenize(s)) if d0_p == 1 and s[0] == '(' and s[-1] == ')': # s can come back with extra parens s = s[1:-1] s = self._python_logical_to_invenio(s) return s def tokenize(self, query): """Given a query string, return a list of tokens from that string. * Isolates meaningful punctuation: ( ) + | - * Keeps single- and doubl-quoted strings together without interpretation. * Splits everything else on whitespace. i.e.: "expr1|expr2 (expr3-(expr4 or expr5))" becomes: ['expr1', '|', 'expr2', '(', 'expr3', '-', '(', 'expr4', 'or', 'expr5', ')', ')'] """ ### # Invariants: # * Query is never modified # * In every loop iteration, querytokens grows to the right # * The only return point is at the bottom of the function, and the only # return value is querytokens ### def get_tokens(s): """ Given string s, return a list of s's tokens. Adds space around special punctuation, then splits on whitespace. """ s = ' '+s s = s.replace('->', '####DATE###RANGE##OP#') # XXX: Save '->' s = re.sub('(?P[a-zA-Z0-9_,]+)\((?P[a-zA-Z0-9_,]*)\)', '#####\g####PAREN###\g##PAREN#', s) # XXX: Save U(1) and SL(2,Z) for char in self.specials: if char == '-': s = s.replace(' -', ' '+char+' ') s = s.replace(')-', ') '+char+' ') s = s.replace('-(', ' '+char+' (') else: s = s.replace(char, ' '+char+' ') s = re.sub('#####(?P[a-zA-Z0-9_,]+)####PAREN###(?P[a-zA-Z0-9_,]*)##PAREN#', '\g(\g)', s) # XXX: Restore U(1) and SL(2,Z) s = s.replace('####DATE###RANGE##OP#', '->') # XXX: Restore '->' return s.split() querytokens = [] current_position = 0 re_quotepairs = re.compile(r'[^\\](".*?[^\\]")|[^\\](\'.*?[^\\]\')') for match in re_quotepairs.finditer(query): # special case for when our regexp captures a single char before # the quotes. This is faster (in development time and code) # than a much more complicated and complete regexp or using an # FSM for quote balancing. XXX: But is there a better way? match_start = match.start() quoted_region = match.group(0).strip() if quoted_region[0] not in "'\"": match_start += 1 # capture char 0 in unquoted below quoted_region = quoted_region[1:] # clean the content after the previous quotes and before current quotes unquoted = query[current_position : match_start] querytokens.extend(get_tokens(unquoted)) # XXX: sometimes we end up with labeled quoted regions split off from their # labels, e.g., 'title:"compton scattering"' becomes # ['title:', '"compton scattering"'] rather than ['title:"compton scattering"'] # This corrects for that. if querytokens[-1][-1] == ':': querytokens[-1] += quoted_region else: # add our newly tokenized content to the token list querytokens.extend([quoted_region]) # move current position to the end of the tokenized content current_position = match.end() # get tokens from the last appearance of quotes until the query end unquoted = query[current_position : len(query)] querytokens.extend(get_tokens(unquoted)) return querytokens def parse(self, token_list, variable_substitution_dict=None): """Make token_list consumable by search_engine. 
Turns a list of tokens and a variable mapping into a grouped list of subexpressions in the format suitable for use by search_engine, e.g.: ['+', 'searchterm', '-', 'searchterm to exclude', '|', 'another term'] Incidentally, this works recursively so parens can cause arbitrarily deep nestings. But since the search_engine doesn't know about nested structures, we need to flatten the input structure first. """ ### # Invariants: # * Token list is never modified # * Balanced parens remain balanced; unbalanced parens are an error # * Individual tokens may only be exchanged for items in the variable # substitution dict; otherwise they pass through unmolested # * Return value is built up mostly as a stack ### op_symbols = self.substitution_dict.values() self.__tl_idx = 0 self.__tl_len = len(token_list) def inner_parse(token_list, open_parens=False): if open_parens: parsed_values = [] else: parsed_values = ['+'] i = 0 while i < len(token_list): token = token_list[i] if i > 0 and parsed_values[-1] not in op_symbols: parsed_values.append('+') if token == '(': offset = self.__tl_len - len(token_list) inner_value = inner_parse(token_list[i+1:], True) inner_value = ' '.join(inner_value) parsed_values.append(inner_value) self.__tl_idx += 1 i = self.__tl_idx - offset elif token == ')': if parsed_values[-1] in op_symbols: parsed_values = parsed_values[:-1] if parsed_values[0] == '+' and parsed_values[1] in op_symbols: parsed_values = parsed_values[1:] return parsed_values elif token in op_symbols: if len(parsed_values) > 0: parsed_values[-1] = token else: parsed_values = [token] else: if variable_substitution_dict != None and token in variable_substitution_dict: token = variable_substitution_dict[token] parsed_values.append(token) i += 1 self.__tl_idx += 1 # If we have an extra start symbol, remove the default one if parsed_values[1] in op_symbols: parsed_values = parsed_values[1:] return parsed_values return inner_parse(token_list, False) class SpiresToInvenioSyntaxConverter: """Converts queries defined with SPIRES search syntax into queries that use Invenio search syntax. 
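    A sketch of typical conversions, drawn from the unit tests below (the
    converter's output is compared by parsed search units, so the right-hand
    sides are equivalent queries rather than exact output strings):

        find a ellis and t shapes   ->  author:ellis and title:shapes
        find t bob sam              ->  title:bob and title:sam
        find date > 1980            ->  year:1980->9999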
""" # Constants defining fields _DATE_ADDED_FIELD = 'datecreated:' _DATE_UPDATED_FIELD = 'datemodified:' _DATE_FIELD = 'year:' _A_TAG = 'author:' _EA_TAG = 'exactauthor:' # Dictionary containing the matches between SPIRES keywords # and their corresponding Invenio keywords or fields # SPIRES keyword : Invenio keyword or field _SPIRES_TO_INVENIO_KEYWORDS_MATCHINGS = { # affiliation 'affiliation' : 'affiliation:', 'affil' : 'affiliation:', 'aff' : 'affiliation:', 'af' : 'affiliation:', 'institution' : 'affiliation:', 'inst' : 'affiliation:', # any field 'any' : 'anyfield:', # bulletin 'bb' : 'reportnumber:', 'bbn' : 'reportnumber:', 'bull' : 'reportnumber:', 'bulletin-bd' : 'reportnumber:', 'bulletin-bd-no' : 'reportnumber:', 'eprint' : 'reportnumber:', # citation / reference 'c' : 'reference:', 'citation' : 'reference:', 'cited' : 'reference:', 'jour-vol-page' : 'reference:', 'jvp' : 'reference:', # collaboration 'collaboration' : 'collaboration:', 'collab-name' : 'collaboration:', 'cn' : 'collaboration:', # conference number 'conf-number' : '111__g:', 'cnum' : '111__g:', # country 'cc' : '044__a:', 'country' : '044__a:', # date 'date': _DATE_FIELD, 'd': _DATE_FIELD, # date added 'date-added': _DATE_ADDED_FIELD, 'dadd': _DATE_ADDED_FIELD, 'da': _DATE_ADDED_FIELD, # date updated 'date-updated': _DATE_UPDATED_FIELD, 'dupd': _DATE_UPDATED_FIELD, 'du': _DATE_UPDATED_FIELD, # first author 'fa' : '100__a:', 'first-author' : '100__a:', # author 'a' : 'author:', 'au' : 'author:', 'author' : 'author:', 'name' : 'author:', # exact author # this is not a real keyword match. It is pseudo keyword that # will be replaced later with author search 'ea' : 'exactauthor:', 'exact-author' : 'exactauthor:', # experiment 'exp' : 'experiment:', 'experiment' : 'experiment:', 'expno' : 'experiment:', 'sd' : 'experiment:', 'se' : 'experiment:', # journal 'journal' : 'journal:', 'j' : 'journal:', 'published_in' : 'journal:', 'spicite' : 'journal:', 'vol' : 'journal:', # journal page 'journal-page' : '773__c:', 'jp' : '773__c:', # journal year 'journal-year' : '773__y:', 'jy' : '773__y:', # key 'key' : '970__a:', 'irn' : '970__a:', 'record' : '970__a:', 'document' : '970__a:', 'documents' : '970__a:', # keywords 'k' : 'keyword:', 'keywords' : 'keyword:', 'kw' : 'keyword:', # note 'note' : '500__a:', 'n' : '500__a:', # old title 'old-title' : '246__a:', 'old-t' : '246__a:', 'ex-ti' : '246__a:', 'et' : '246__a:', # ppf subject 'ppf-subject' : '650__a:', 'ps' : '650__a:', 'scl' : '650__a:', 'status' : '650__a:', # report number 'r' : 'reportnumber:', 'rn' : 'reportnumber:', 'rept' : 'reportnumber:', 'report' : 'reportnumber:', 'report-num' : 'reportnumber:', # title 't' : 'title:', 'ti' : 'title:', 'title' : 'title:', 'with-language' : 'title:', # fulltext 'fulltext' : 'fulltext:', 'ft' : 'fulltext:', # topic 'topic' : '695__a:', 'tp' : '695__a:', 'hep-topic' : '695__a:', 'desy-keyword' : '695__a:', 'dk' : '695__a:', # second-order operators 'refersto' : 'refersto:', 'refs': 'refersto:', 'citedby' : 'citedby:', # replace all the keywords without match with empty string # this will remove the noise from the unknown keywrds in the search # and will in all fields for the words following the keywords # category 'arx' : '037__c:', 'category' : '037__c:', # primarch 'parx' : '037__c:', 'primarch' : '037__c:', # texkey 'texkey' : '035__z:', # type code 'tc' : '690C_a:', 'ty' : '690C_a:', 'type' : '690C_a:', 'type-code' : '690C_a', # field code 'f' : '65017a:', 'fc' : '65017a:', 'field' : '65017a:', 'field-code' : '65017a:', # 
coden 'bc' : '', 'browse-only-indx' : '', 'coden' : '', 'journal-coden' : '', # energy 'e' : '', 'energy' : '', 'energyrange-code' : '', # exact experiment number 'ee' : '', 'exact-exp' : '', 'exact-expno' : '', # hidden note 'hidden-note' : '', 'hn' : '', # ppf 'ppf' : '', 'ppflist' : '', # slac topics 'ppfa' : '', 'slac-topics' : '', 'special-topics' : '', 'stp' : '', # test index 'test' : '', 'testindex' : '', } def __init__(self): """Initialize the state of the converter""" self._months = {} self._month_name_to_month_number = {} self._init_months() self._compile_regular_expressions() def _compile_regular_expressions(self): """Compiles some of the regular expressions that are used in the class for higher performance.""" # regular expression that matches the contents in single and double quotes # taking in mind if they are escaped. self._re_quotes_match = re.compile('[^\\\\](".*?[^\\\\]")|[^\\\\](\'.*?[^\\\\]\')') # for matching cases where kw needs distributing self._re_distribute_keywords = re.compile(r'\b(?P\S*:)(?P.+?)\s*(?Pand not | and | or | not )\s*(?P[^:]*?)(?= and not | and | or | not |$)', re.IGNORECASE) # regular expression that matches author patterns self._re_author_match = re.compile(r'\bauthor:\s*(?P.+?)\s*(?= and not | and | or | not |$)', re.IGNORECASE) # regular expression that matches exact author patterns # the group defined in this regular expression is used in method # _convert_spires_exact_author_search_to_invenio_author_search(...) # in case of changes correct also the code in this method self._re_exact_author_match = re.compile(r'\bexactauthor:(?P[^\'\"].*?[^\'\"]\b)(?= and not | and | or | not |$)', re.IGNORECASE) # regular expression that matches search term, its content (words that # are searched) and the operator preceding the term. In case that the # names of the groups defined in the expression are changed, the # chagned should be reflected in the code that use it. self._re_search_term_pattern_match = re.compile(r'\b(?Pfind|and|or|not)\s+(?Ptitle:|keyword:|fulltext:)(?P.+?)(?= and not | and | or | not |$)', re.IGNORECASE) # regular expression matching date after pattern self._re_date_after_match = re.compile(r'\b(?Pd|date|dupd|dadd|da|date-added|du|date-updated)\b\s*(after|>)\s*(?P.+?)(?= and not | and | or | not |$)', re.IGNORECASE) # regular expression matching date after pattern self._re_date_before_match = re.compile(r'\b(?Pd|date|dupd|dadd|da|date-added|du|date-updated)\b\s*(before|<)\s*(?P.+?)(?= and not | and | or | not |$)', re.IGNORECASE) # regular expression that matches date searches which have been # keyword-substituted self._re_keysubbed_date_expr = re.compile(r'\b(?P(' + self._DATE_ADDED_FIELD + ')|(' + self._DATE_UPDATED_FIELD + ')|(' + self._DATE_FIELD + '))\s*(?P.+)(?= and not | and | or | not |$)', re.IGNORECASE) # for finding (and changing) a variety of different SPIRES search keywords self._re_spires_find_keyword = re.compile('^(?Pf|fin|find)\s+(?P.*)$', re.IGNORECASE) # patterns for subbing out spaces within quotes temporarily self._re_pattern_single_quotes = re.compile("'(.*?)'") self._re_pattern_double_quotes = re.compile("\"(.*?)\"") self._re_pattern_regexp_quotes = re.compile("\/(.*?)\/") self._re_pattern_space = re.compile("__SPACE__") self._re_pattern_IRN_search = re.compile(r'970__a:(?P\d+)') def is_applicable(self, query): """Is this converter applicable to this query? 
Returns True IFF the query starts with SPIRES' 'find' keyword or some acceptable variation thereof.""" if self._re_spires_find_keyword.match(query.lower()): return True else: return False def convert_query(self, query): """Convert SPIRES syntax queries to Invenio syntax. Do nothing to queries not in SPIRES syntax.""" # SPIRES syntax allows searches with 'find' or 'fin'. if self.is_applicable(query): # Everywhere else make the assumption that all and only queries # starting with 'find' are SPIRES queries. Turn fin into find. query = self._re_spires_find_keyword.sub(lambda m: 'find '+m.group('query'), query) # these calls are before keywords replacement because when keywords # are replaced, date keyword is replaced by specific field search # and the DATE keyword is not matched in DATE BEFORE or DATE AFTER query = self._convert_spires_date_before_to_invenio_span_query(query) query = self._convert_spires_date_after_to_invenio_span_query(query) # call to _replace_spires_keywords_with_invenio_keywords should be at the # beginning because the next methods use the result of the replacement query = self._replace_spires_keywords_with_invenio_keywords(query) query = self._distribute_keywords_across_combinations(query) query = self._convert_dates(query) query = self._convert_irns_to_spires_irns(query) query = self._convert_spires_author_search_to_invenio_author_search(query) query = self._convert_spires_exact_author_search_to_invenio_author_search(query) query = self._convert_spires_truncation_to_invenio_truncation(query) query = self._expand_search_patterns(query) # remove FIND in the beginning of the query as it is not necessary in Invenio query = query[4:] query = query.strip() return query def _init_months(self): """Defines a dictionary matching the name of the month with its corresponding number""" # this dictionary is used when generating match patterns for months self._months = {'jan':'01', 'january':'01', 'feb':'02', 'february':'02', 'mar':'03', 'march':'03', 'apr':'04', 'april':'04', 'may':'05', 'jun':'06', 'june':'06', 'jul':'07', 'july':'07', 'aug':'08', 'august':'08', 'sep':'09', 'september':'09', 'oct':'10', 'october':'10', 'nov':'11', 'november':'11', 'dec':'12', 'december':'12'} # this dictionary is used to transform the name of the month # into a number used in the date format.
For this reason it # also contains the numbers themselves, to simplify the conversion self._month_name_to_month_number = {'1':'01', '01':'01', '2':'02', '02':'02', '3':'03', '03':'03', '4':'04', '04':'04', '5':'05', '05':'05', '6':'06', '06':'06', '7':'07', '07':'07', '8':'08', '08':'08', '9':'09', '09':'09', '10':'10', '11':'11', '12':'12',} # combine it with months in order to cover all the cases self._month_name_to_month_number.update(self._months) def _get_month_names_match(self): """Returns part of a pattern that matches a month in a date""" months_match = '' for month_name in self._months.keys(): months_match = months_match + month_name + '|' months_match = r'\b(' + months_match[0:-1] + r')\b' return months_match def _convert_dates(self, query): """Tries to find dates in query and make them look like ISO-8601.""" def mangle_with_dateutils(query): DEFAULT = datetime(datetime.today().year, 1, 1) result = '' position = 0 for match in self._re_keysubbed_date_expr.finditer(query): result += query[position : match.start()] isodates = [] dates = match.group('content').split('->') # Warning: generalizing but should only ever be 2 items for datestamp in dates: if datestamp != None: if re.match('[0-9]{1,4}$', datestamp): isodates.append(datestamp) else: try: dtobj = dateutil.parser.parse(datestamp, default=DEFAULT) if dtobj.day == 1: isodates.append("%d-%02d" % (dtobj.year, dtobj.month)) else: isodates.append("%d-%02d-%02d" % (dtobj.year, dtobj.month, dtobj.day)) except ValueError: isodates.append(datestamp) daterange = '->'.join(isodates) #if re.search('[^\s]+-[^>][^\s]*', daterange): # daterange = '"' + daterange + '"' result += match.group('term') + daterange position = match.end() result += query[position : ] return result if GOT_DATEUTIL: query = mangle_with_dateutils(query) # else do nothing with the dates return query def _convert_irns_to_spires_irns(self, query): """Prefix IRN numbers with SPIRES- so they match the INSPIRE format.""" def create_replacement_pattern(match): """method used for replacement with regular expression""" return '970__a:SPIRES-' + match.group('irn') query = self._re_pattern_IRN_search.sub(create_replacement_pattern, query) return query def _convert_spires_date_after_to_invenio_span_query(self, query): """Converts date after SPIRES search term into invenio span query""" def create_replacement_pattern(match): """method used for replacement with regular expression""" return match.group('searchop') + ' ' + match.group('search_content') + '->9999' query = self._re_date_after_match.sub(create_replacement_pattern, query) return query def _convert_spires_date_before_to_invenio_span_query(self, query): """Converts date before SPIRES search term into invenio span query""" # method used for replacement with regular expression def create_replacement_pattern(match): return match.group('searchop') + ' ' + '0->' + match.group('search_content') query = self._re_date_before_match.sub(create_replacement_pattern, query) return query def _expand_search_patterns(self, query): """Expands search queries. If a search term is followed by several words, e.g. author:ellis or title:THESE THREE WORDS, it is expanded to author:ellis or (title:THESE and title:THREE ...) Not all the search terms are expanded this way; only title:, keyword:, and fulltext: are - see _re_search_term_pattern_match for details.
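        A sketch of the expansion, mirroring the unit tests (parenthesisation
        and spacing are indicative only, since the tests compare results as
        parsed search units):

            find t bob sam              ->  title:bob and title:sam
            find t bob sam and k couch  ->  title:bob and title:sam and keyword:couch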
""" def create_replacements(term, content): result = '' content = content.strip() # replace spaces within quotes by __SPACE__ temporarily: content = self._re_pattern_single_quotes.sub(lambda x: "'"+string.replace(x.group(1), ' ', '__SPACE__')+"'", content) content = self._re_pattern_double_quotes.sub(lambda x: "\""+string.replace(x.group(1), ' ', '__SPACE__')+"\"", content) content = self._re_pattern_regexp_quotes.sub(lambda x: "/"+string.replace(x.group(1), ' ', '__SPACE__')+"/", content) words = content.split() if len(words) > 1: result = '(' + term + words[0] for word in words[1:]: result += ' and ' + term + word result += ')' else: result = term + words[0] # replace back __SPACE__ by spaces: result = self._re_pattern_space.sub(" ", result) return result.strip() result = '' current_position = 0 for match in self._re_search_term_pattern_match.finditer(query): result += query[current_position : match.start()] result += ' ' + match.group('combine_operator') + ' ' result += create_replacements(match.group('search_term'), match.group('search_content')) current_position = match.end() result += query[current_position : len(query)] return result.strip() def _convert_spires_truncation_to_invenio_truncation(self, query): """Replace SPIRES truncation symbol # with invenio trancation symbol *""" return query.replace('#', '*') def _convert_spires_exact_author_search_to_invenio_author_search(self, query): """Converts SPIRES search patterns for exact author into search pattern for invenio""" # method used for replacement with regular expression def create_replacement_pattern(match): # the regular expression where this group name is defined is in # the method _compile_regular_expressions() return self._EA_TAG + '"' + match.group('author_name') + '"' query = self._re_exact_author_match.sub(create_replacement_pattern, query) return query def _convert_spires_author_search_to_invenio_author_search(self, query): """Converts SPIRES search patterns for authors to search patterns in invenio that give similar results to the spires search. 
""" # result of the replacement result = '' current_position = 0 for match in self._re_author_match.finditer(query): result += query[current_position : match.start() ] scanned_name = NameScanner.scan(match.group('name')) author_atoms = self._create_author_search_pattern_from_fuzzy_name_dict(scanned_name) if author_atoms.find(' ') == -1: result += author_atoms + ' ' else: result += '(' + author_atoms + ') ' current_position = match.end() result += query[current_position : len(query)] return result def _create_author_search_pattern_from_fuzzy_name_dict(self, fuzzy_name): """Creates an invenio search pattern for an author from a fuzzy name dict""" author_name = '' author_middle_name = '' author_surname = '' if len(fuzzy_name['nonlastnames']) > 0: author_name = fuzzy_name['nonlastnames'][0] if len(fuzzy_name['nonlastnames']) == 2: author_middle_name = fuzzy_name['nonlastnames'][1] if len(fuzzy_name['nonlastnames']) > 2: author_middle_name = ' '.join(fuzzy_name['nonlastnames'][1:]) author_surname = ' '.join(fuzzy_name['lastnames']) NAME_IS_INITIAL = (len(author_name) == 1) NAME_IS_NOT_INITIAL = not NAME_IS_INITIAL # we expect to have at least surname if author_surname == '' or author_surname == None: return '' # ellis ---> "author:ellis" #if author_name == '' or author_name == None: if not author_name: return self._A_TAG + author_surname # ellis, j ---> "ellis, j*" if NAME_IS_INITIAL and not author_middle_name: return self._A_TAG + '"' + author_surname + ', ' + author_name + '*"' # if there is middle name we expect to have also name and surname # ellis, j. r. ---> ellis, j* r* # j r ellis ---> ellis, j* r* # ellis, john r. ---> ellis, j* r* or ellis, j. r. or ellis, jo. r. # ellis, john r. ---> author:ellis, j* r* or exactauthor:ellis, j r or exactauthor:ellis jo r if author_middle_name: search_pattern = self._A_TAG + '"' + author_surname + ', ' + author_name + '*' + ' ' + author_middle_name.replace(" ","* ") + '*"' if NAME_IS_NOT_INITIAL: for i in range(1, len(author_name)): search_pattern += ' or ' + self._EA_TAG + "\"%s, %s %s\"" % (author_surname, author_name[0:i], author_middle_name) return search_pattern # ellis, jacqueline ---> "ellis, jacqueline" or "ellis, j.*" or "ellis, j" or "ellis, ja.*" or "ellis, ja" or "ellis, jacqueline *, ellis, j *" # in case we don't use SPIRES data, the ending dot is ommited. 
search_pattern = self._A_TAG + '"' + author_surname + ', ' + author_name + '*"' search_pattern += " or " + self._EA_TAG + "\"%s, %s *\"" % (author_surname, author_name[0]) if NAME_IS_NOT_INITIAL: for i in range(1,len(author_name)): search_pattern += ' or ' + self._EA_TAG + "\"%s, %s\"" % (author_surname, author_name[0:i]) return search_pattern def _replace_spires_keywords_with_invenio_keywords(self, query): """Replaces SPIRES keywords that have directly corresponding Invenio keywords Replacements are done only in content that is not in quotes.""" # result of the replacement result = "" current_position = 0 for match in self._re_quotes_match.finditer(query): # clean the content after the previous quotes and before current quotes cleanable_content = query[current_position : match.start()] cleanable_content = self._replace_all_spires_keywords_in_string(cleanable_content) # get the content in the quotes (group one matches double # quotes, group 2 singles) if match.group(1): quoted_content = match.group(1) elif match.group(2): quoted_content = match.group(2) # append the processed content to the result result = result + cleanable_content + quoted_content # move current position at the end of the processed content current_position = match.end() # clean the content from the last appearance of quotes till the end of the query cleanable_content = query[current_position : len(query)] cleanable_content = self._replace_all_spires_keywords_in_string(cleanable_content) result = result + cleanable_content return result def _replace_all_spires_keywords_in_string(self, query): """Replaces all SPIRES keywords in the string with their corresponding Invenio keywords""" for spires_keyword, invenio_keyword in self._SPIRES_TO_INVENIO_KEYWORDS_MATCHINGS.iteritems(): query = self._replace_keyword(query, spires_keyword, \ invenio_keyword) return query def _replace_keyword(self, query, old_keyword, new_keyword): """Replaces old keyword in the query with a new keyword""" # perform case insensitive replacement with regular expression regex_string = r'\b(?P<operator>(find|and|or|not)\b[\s\(]*)' + \ old_keyword + r'(?P<end>[\s\(]+|$)' regular_expression = re.compile(regex_string, re.IGNORECASE) result = regular_expression.sub(r'\g<operator>' + new_keyword + r'\g<end>', query) result = re.sub(':\s+', ':', result) return result def _distribute_keywords_across_combinations(self, query): # method used for replacement with regular expression def create_replacement_pattern(match): # the regular expression where this group name is defined is in # the method _compile_regular_expressions() return match.group('keyword') + match.group('content') + \ ' ' + match.group('combination') + ' ' + match.group('keyword') + \ match.group('last_content') query = self._re_distribute_keywords.sub(create_replacement_pattern, query) return query diff --git a/modules/websearch/lib/search_engine_query_parser_tests.py b/modules/websearch/lib/search_engine_query_parser_tests.py index 322bb8b94..c41b49415 100644 --- a/modules/websearch/lib/search_engine_query_parser_tests.py +++ b/modules/websearch/lib/search_engine_query_parser_tests.py @@ -1,587 +1,597 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version.
## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Unit tests for the search engine query parsers.""" import unittest from invenio import search_engine_query_parser from invenio.testutils import make_test_suite, run_test_suite from invenio.search_engine import create_basic_search_units, perform_request_search try: import dateutil DATEUTIL_AVAILABLE = True except ImportError: DATEUTIL_AVAILABLE = False class TestParserUtilityFunctions(unittest.TestCase): """Test utility functions for the parsing components""" def setUp(self): self.parser = search_engine_query_parser.SearchQueryParenthesisedParser() self.converter = search_engine_query_parser.SpiresToInvenioSyntaxConverter() def test_ndb_simple(self): """SQPP.test_nesting_depth_and_balance: ['p0']""" self.assertEqual((0, True, 0), self.parser.nesting_depth_and_balance(['p0'])) def test_ndb_simple_useful(self): """SQPP.test_nesting_depth_and_balance: ['(', 'p0', ')']""" self.assertEqual((1, True, 1), self.parser.nesting_depth_and_balance(['(', 'p0', ')'])) def test_ndb_slightly_complicated(self): """SQPP.test_nesting_depth_and_balance: ['(', 'p0', ')', '|', '(', 'p2', '+', 'p3', ')']""" self.assertEqual((1, True, 2), self.parser.nesting_depth_and_balance(['(', 'p0', ')', '|', '(', 'p2', '+', 'p3', ')'])) def test_ndb_sorta_hairy(self): """SQPP.test_nesting_depth_and_balance: ['(', '(', ')', ')', '(', '(', '(', ')', ')', ')']""" self.assertEqual((3, True, 2), self.parser.nesting_depth_and_balance(['(', '(', ')', ')', '(', '(', '(', ')', ')', ')'])) def test_ndb_broken_rhs(self): """SQPP.test_nesting_depth_and_balance: ['(', '(', ')', ')', '(', '(', '(', ')', ')', ]""" self.assertEqual((3, False, 2), self.parser.nesting_depth_and_balance(['(', '(', ')', ')', '(', '(', '(', ')', ')', ])) def test_ndb_broken_lhs(self): """SQPP.test_nesting_depth_and_balance: ['(', ')', ')', '(', '(', '(', ')', ')', ]""" self.assertEqual((3, False, 2), self.parser.nesting_depth_and_balance(['(', ')', ')', '(', '(', '(', ')', ')', ])) def test_stisc(self): """Test whole convert/parse stack: SQPP.parse_query(STISC.convert_query('find a richter, burton and t quark'))""" self.assertEqual(self.parser.parse_query(self.converter.convert_query('find a richter, burton and t quark')), ['+', 'author:"richter, burton*" | exactauthor:"richter, b *" | exactauthor:"richter, b" | exactauthor:"richter, bu" | exactauthor:"richter, bur" | exactauthor:"richter, burt" | exactauthor:"richter, burto"', '+', 'title:quark']) + def test_stisc_not_vs_and_not1(self): + """Parse stack parses "find a ellis, j and not a enqvist" == "find a ellis, j not a enqvist" """ + self.assertEqual(self.parser.parse_query(self.converter.convert_query('find a ellis, j and not a enqvist')), + self.parser.parse_query(self.converter.convert_query('find a ellis, j not a enqvist'))) + + def test_stisc_not_vs_and_not2(self): + """Parse stack parses "find a mangano, m and not a ellis, j" == "find a mangano, m not a ellis, j" """ + self.assertEqual(self.parser.parse_query(self.converter.convert_query('find a mangano, m and not a ellis, j')), + self.parser.parse_query(self.converter.convert_query('find a mangano, m not
a ellis, j'))) + class TestSearchQueryParenthesisedParser(unittest.TestCase): """Test parenthesis parsing.""" def setUp(self): self.parser = search_engine_query_parser.SearchQueryParenthesisedParser() def test_sqpp_atom(self): """SearchQueryParenthesisedParser - expr1""" self.assertEqual(self.parser.parse_query('expr1'), ['+', 'expr1']) def test_sqpp_parened_atom(self): """SearchQueryParenthesisedParser - (expr1)""" self.assertEqual(self.parser.parse_query('(expr1)'), ['+', 'expr1']) def test_sqpp_expr1_minus_expr2(self): """SearchQueryParenthesisedParser - expr1 - (expr2)""" self.assertEqual(self.parser.parse_query("expr1 - (expr2)"), ['+', 'expr1', '-', 'expr2']) def test_sqpp_plus_expr1_minus_paren_expr2(self): """SearchQueryParenthesisedParser - + expr1 - (expr2)""" self.assertEqual(self.parser.parse_query("+ expr1 - (expr2)"), ['+', 'expr1', '-', 'expr2']) def test_sqpp_expr1_paren_expr2(self): """SearchQueryParenthesisedParser - expr1 (expr2)""" self.assertEqual(self.parser.parse_query("expr1 (expr2)"), ['+', 'expr1', '+', 'expr2']) def test_sqpp_paren_expr1_minus_expr2(self): """SearchQueryParenthesisedParser - (expr1) - expr2""" self.assertEqual(self.parser.parse_query("(expr1) - expr2"), ['+', 'expr1', '-', 'expr2']) def test_sqpp_paren_expr1_minus_paren_expr2(self): """SearchQueryParenthesisedParser - (expr1)-(expr2)""" self.assertEqual(self.parser.parse_query("(expr1)-(expr2)"), ['+', 'expr1', '-', 'expr2']) def test_sqpp_minus_paren_expr1_minus_paren_expr2(self): """SearchQueryParenthesisedParser - -(expr1)-(expr2)""" self.assertEqual(self.parser.parse_query("-(expr1)-(expr2)"), ['-', 'expr1', '-', 'expr2']) def test_sqpp_paren_expr1_minus_expr2_and_paren_expr3(self): """SearchQueryParenthesisedParser - (expr1) - expr2 + (expr3)""" self.assertEqual(self.parser.parse_query('(expr1) - expr2 + (expr3)'), ['+', 'expr1', '-', 'expr2', '+', 'expr3']) def test_sqpp_paren_expr1_minus_expr2_and_paren_expr3_or_expr4(self): """SearchQueryParenthesisedParser - (expr1) - expr2 + (expr3) | expr4""" self.assertEqual(self.parser.parse_query('(expr1) - expr2 + (expr3) | expr4'), ['+', 'expr1', '-', 'expr2', '+', 'expr3', '|', 'expr4']) #['+', '+ expr1 | expr4', '+', '- expr2 | expr4', '+', '+ expr3 | expr4']) def test_sqpp_paren_expr1_minus_expr2_and_paren_expr3_or_expr4_or_quoted_expr5_and_expr6(self): """SearchQueryParenthesisedParser - (expr1) - expr2 + (expr3 | expr4) | \"expr5 + expr6\"""" self.assertEqual(self.parser.parse_query('(expr1) - expr2 + (expr3 | expr4) | "expr5 + expr6"'), ['+', 'expr1', '-', 'expr2', '+', 'expr3 | expr4', '|', '"expr5 + expr6"']), #['+', '+ expr1 | "expr5 + expr6"', '+', '- expr2 | "expr5 + expr6"', # '+', '+ expr3 | expr4 | "expr5 + expr6"']) def test_sqpp_quoted_expr1_and_paren_expr2_and_expr3(self): """SearchQueryParenthesisedParser - \"expr1\" (expr2) expr3""" self.assertEqual(self.parser.parse_query('"expr1" (expr2) expr3'), ['+', '"expr1"', '+', 'expr2', '+', 'expr3']) def test_sqpp_paren_expr1_expr2_paren_expr3_or_expr4(self): """SearchQueryParenthesisedParser - (expr1) expr2 (expr3) | expr4""" # test parsing of queries with missing operators.
# in this case default operator + should be included in place of the missing one self.assertEqual(self.parser.parse_query('(expr1) expr2 (expr3) | expr4'), ['+', 'expr1', '+', 'expr2', '+', 'expr3', '|', 'expr4']) #['+', '+ expr1 | expr4', '+', '+ expr2 | expr4', '+', '+ expr3 | expr4']) def test_sqpp_nested_paren_success(self): """SearchQueryParenthesisedParser - Arbitrarily nested parentheses: ((expr1)) + (expr2 - expr3)""" self.assertEqual(self.parser.parse_query('((expr1)) + (expr2 - expr3)'), ['+', 'expr1', '+', 'expr2', '-', 'expr3']) #['+', 'expr1', '+', 'expr2', '-', 'expr3']) def test_sqpp_nested_paren_really_nested(self): """SearchQueryParenthesisedParser - Nested parentheses where order matters: expr1 - (expr2 - (expr3 | expr4))""" self.assertEqual(self.parser.parse_query('expr1 - (expr2 - (expr3 | expr4))'), ['+', 'expr1', '+', '- expr2 | expr3 | expr4']) def test_sqpp_paren_open_only_failure(self): """SearchQueryParenthesisedParser - Parentheses that only open should raise an exception""" self.failUnlessRaises(SyntaxError, self.parser.parse_query,"(expr") def test_sqpp_paren_close_only_failure(self): """SearchQueryParenthesisedParser - Parentheses that only close should raise an exception""" self.failUnlessRaises(SyntaxError, self.parser.parse_query,"expr)") def test_sqpp_paren_expr1_not_expr2_and_paren_expr3_or_expr4_WORDS(self): """SearchQueryParenthesisedParser - (expr1) not expr2 and (expr3) or expr4""" self.assertEqual(self.parser.parse_query('(expr1) not expr2 and (expr3) or expr4'), ['+', 'expr1', '-', 'expr2', '+', 'expr3', '|', 'expr4']) #['+', '+ expr1 | expr4', '+', '- expr2 | expr4', '+', '+ expr3 | expr4']) def test_sqpp_paren_expr1_not_expr2_or_quoted_string_not_expr3_or_expr4WORDS(self): """SearchQueryParenthesisedParser - (expr1) not expr2 | "expressions not in and quotes | (are) not - parsed " - (expr3) or expr4""" self.assertEqual(self.parser.parse_query('(expr1) not expr2 | "expressions not in and quotes | (are) not - parsed " - (expr3) or expr4'), ['+', 'expr1', '-', 'expr2', '|', '"expressions not in and quotes | (are) not - parsed "', '-', 'expr3', '|', 'expr4']) #['+', '+ "expressions not in and quotes | (are) not - parsed " | expr1 | expr4', # '+', '- expr3 | expr1 | expr4', # '+', '+ "expressions not in and quotes | (are) not - parsed " - expr2 | expr4', # '+', '- expr3 - expr2 | expr4']) def test_sqpp_expr1_escaped_quoted_expr2_and_paren_expr3_not_expr4_WORDS(self): """SearchQueryParenthesisedParser - expr1 \\" expr2 foo(expr3) not expr4 \\" and (expr5)""" self.assertEqual(self.parser.parse_query('expr1 \\" expr2 foo(expr3) not expr4 \\" and (expr5)'), ['+', 'expr1', '+', '\\"', '+', 'expr2', '+', 'foo(expr3)', '-', 'expr4', '+', '\\"', '+', 'expr5']) def test_sqpp_paren_expr1_and_expr2_or_expr3_WORDS(self): """SearchQueryParenthesisedParser - (expr1 and expr2) or expr3""" self.assertEqual(self.parser.parse_query('(expr1 and expr2) or expr3'), ['+', 'expr1 + expr2', '|', 'expr3']) #['+', '+ expr1 | expr3', '+', '+ expr2 | expr3']) def test_sqpp_paren_expr1_and_expr2_or_expr3_WORDS_equiv(self): """SearchQueryParenthesisedParser - (expr1 and expr2) or expr3 == (expr1 + expr2) | expr3""" self.assertEqual(self.parser.parse_query('(expr1 and expr2) or expr3'), self.parser.parse_query('(expr1 + expr2) | expr3')) def test_sqpp_paren_expr1_and_expr2_or_expr3_WORDS_equiv_SYMBOLS(self): """SearchQueryParenthesisedParser - (expr1 and expr2) or expr3 == (expr1 + expr2) or expr3""" self.assertEqual(self.parser.parse_query('(expr1 and expr2) or expr3'),
self.parser.parse_query('(expr1 + expr2) or expr3')) def test_sqpp_double_quotes(self): """SearchQueryParenthesisedParser - Test double quotes""" self.assertEqual(self.parser.parse_query( '(expr1) - expr2 | "expressions - in + quotes | (are) not - parsed " - (expr3) | expr4'), ['+', 'expr1', '-', 'expr2', '|', '"expressions - in + quotes | (are) not - parsed "', '-', 'expr3', '|', 'expr4']) #['+', '+ "expressions - in + quotes | (are) not - parsed " | expr1 | expr4', # '+', '- expr3 | expr1 | expr4', # '+', '+ "expressions - in + quotes | (are) not - parsed " - expr2 | expr4', # '+', '- expr3 - expr2 | expr4']) def test_sqpp_single_quotes(self): """SearchQueryParenthesisedParser - Test single quotes""" self.assertEqual(self.parser.parse_query("(expr1) - expr2 | 'expressions - in + quotes | (are) not - parsed ' - (expr3) | expr4"), ['+', 'expr1', '-', 'expr2', '|', "'expressions - in + quotes | (are) not - parsed '", '-', 'expr3', '|', 'expr4']) #['+', '+ \'expressions - in + quotes | (are) not - parsed \' | expr1 | expr4', # '+', '- expr3 | expr1 | expr4', # '+', '+ \'expressions - in + quotes | (are) not - parsed \' - expr2 | expr4', # '+', '- expr3 - expr2 | expr4']) def test_sqpp_escape_single_quotes(self): """SearchQueryParenthesisedParser - Test escaping single quotes""" self.assertEqual(self.parser.parse_query("expr1 \\' expr2 +(expr3) -expr4 \\' + (expr5)"), ['+', 'expr1', '+', "\\'", '+', 'expr2', '+', 'expr3', '-', 'expr4', '+', "\\'", '+', 'expr5']) def test_sqpp_escape_double_quotes(self): """SearchQueryParenthesisedParser - Test escaping double quotes""" self.assertEqual(self.parser.parse_query('expr1 \\" expr2 +(expr3) -expr4 \\" + (expr5)'), ['+', 'expr1', '+', '\\"', '+', 'expr2', '+', 'expr3', '-', 'expr4', '+', '\\"', '+', 'expr5']) def test_sqpp_beginning_double_quotes(self): """SearchQueryParenthesisedParser - Test parsing double quotes at beginning""" self.assertEqual(self.parser.parse_query('"expr1" - (expr2)'), ['+', '"expr1"', '-', 'expr2']) def test_sqpp_beginning_double_quotes_negated(self): """SearchQueryParenthesisedParser - Test parsing negated double quotes at beginning""" self.assertEqual(self.parser.parse_query('-"expr1" - (expr2)'), ['-', '"expr1"', '-', 'expr2']) def test_sqpp_long_or_chain(self): """SearchQueryParenthesisedParser - Test long or chains being parsed flat""" self.assertEqual(self.parser.parse_query('p0 or p1 or p2 or p3 or p4'), ['+', 'p0', '|', 'p1', '|', 'p2', '|', 'p3', '|', 'p4']) def test_sqpp_not_after_recursion(self): """SearchQueryParenthesisedParser - Test operations after recursive calls""" self.assertEqual(self.parser.parse_query('(p0 or p1) not p2'), ['+', 'p0 | p1', '-', 'p2']) #['+', '+ p0 | p1', '-', 'p2']) def test_sqpp_oddly_capped_operators(self): """SearchQueryParenthesisedParser - Test conjunctions in any case""" self.assertEqual(self.parser.parse_query('foo oR bar'), ['+', 'foo', '|', 'bar']) def test_space_before_last_paren(self): """SearchQueryParenthesisedParser - Test (ellis )""" self.assertEqual(self.parser.parse_query('(ellis )'), ['+', 'ellis']) def test_sqpp_nested_U1_or_SL2(self): """SearchQueryParenthesisedParser - Test (U(1) or SL(2,Z))""" self.assertEqual(self.parser.parse_query('(U(1) or SL(2,Z))'), ['+', 'u(1) | sl(2,z)']) def test_sqpp_distributed_ands_equivalent(self): """SearchQueryParenthesisedParser - ellis and (kaluza-klein or r-parity) == ellis and (r-parity or kaluza-klein)""" self.assertEqual(sorted(perform_request_search(p='ellis and (kaluza-klein or r-parity)')), 
sorted(perform_request_search(p='ellis and (r-parity or kaluza-klein)'))) class TestSpiresToInvenioSyntaxConverter(unittest.TestCase): """Test SPIRES query parsing and translation to Invenio syntax.""" def _compare_searches(self, invenio_syntax, spires_syntax): """Determine if two queries parse to the same search command. For comparison of actual search results (regression testing), see the tests in the Inspire module. """ parser = search_engine_query_parser.SearchQueryParenthesisedParser() converter = search_engine_query_parser.SpiresToInvenioSyntaxConverter() parsed_query = parser.parse_query(converter.convert_query(spires_syntax)) #parse_query removes any parens that convert_query added, but then #we have to rejoin the list it returns and create basic searches result_obtained = create_basic_search_units( None, ' '.join(parsed_query).replace('+ ',''), '', None ) # in case the desired result has parens parsed_wanted = parser.parse_query(invenio_syntax) result_wanted = create_basic_search_units( None, ' '.join(parsed_wanted).replace('+ ',''), '', None) assert result_obtained == result_wanted, \ """SPIRES parsed as %s instead of %s""" % \ (repr(result_obtained), repr(result_wanted)) return def test_operators(self): """SPIRES search syntax - find a ellis and t shapes""" invenio_search = "author:ellis and title:shapes" spires_search = "find a ellis and t shapes" self._compare_searches(invenio_search, spires_search) def test_nots(self): """SPIRES search syntax - find a ellis and not t hadronic and not t collisions""" invenio_search = "author:ellis and not title:hadronic and not title:collisions" spires_search = "find a ellis and not t hadronic and not t collisions" self._compare_searches(invenio_search, spires_search) def test_author_simplest(self): """SPIRES search syntax - find a ellis""" invenio_search = 'author:ellis' spires_search = 'find a ellis' self._compare_searches(invenio_search, spires_search) def test_author_simple(self): """SPIRES search syntax - find a ellis, j""" invenio_search = 'author:"ellis, j*"' spires_search = 'find a ellis, j' self._compare_searches(invenio_search, spires_search) def test_exactauthor_simple(self): """SPIRES search syntax - find ea ellis, j""" invenio_search = 'exactauthor:"ellis, j"' spires_search = 'find ea ellis, j' self._compare_searches(invenio_search, spires_search) def test_author_reverse(self): """SPIRES search syntax - find a j ellis""" invenio_search = 'author:"ellis, j*"' spires_search = 'find a j ellis' self._compare_searches(invenio_search, spires_search) def test_author_initials(self): """SPIRES search syntax - find a a m polyakov""" inv_search = 'author:"polyakov, a* m*"' spi_search = 'find a a m polyakov' self._compare_searches(inv_search, spi_search) def test_author_many_initials(self): """SPIRES search syntax - find a p d q bach""" inv_search = 'author:"bach, p* d* q*"' spi_search = 'find a p d q bach' self._compare_searches(inv_search, spi_search) def test_author_many_lastnames(self): """SPIRES search syntax - find a alvarez gaume, j r r""" inv_search = 'author:"alvarez gaume, j* r* r*"' spi_search = 'find a alvarez gaume, j r r' self._compare_searches(inv_search, spi_search) def test_author_full_initial(self): """SPIRES search syntax - find a klebanov, ig.r.""" inv_search = 'author:"klebanov, ig* r*" or exactauthor:"klebanov, i r"' spi_search = "find a klebanov, ig.r."
self._compare_searches(inv_search, spi_search) def test_author_full_first(self): """SPIRES search syntax - find a ellis, john""" invenio_search = 'author:"ellis, john*" or exactauthor:"ellis, j *" or exactauthor:"ellis, j" or exactauthor:"ellis, jo" or exactauthor:"ellis, joh"' spires_search = 'find a ellis, john' self._compare_searches(invenio_search, spires_search) def test_combine_multiple(self): """SPIRES search syntax - find a gattringer, c and k symmetry chiral and not title chiral""" inv_search = 'author:"gattringer, c*" keyword:chiral keyword:symmetry -title:chiral' spi_search = "find a c gattringer and k chiral symmetry and not title chiral" self._compare_searches(inv_search, spi_search) def test_combine_multiple_or(self): """SPIRES search syntax - find a j ellis and (t report or k \"cross section\")""" inv_search = 'author:"ellis, j*" and (title:report or keyword:"cross section")' spi_search = 'find a j ellis and (t report or k "cross section")' self._compare_searches(inv_search, spi_search) def test_irn_processing(self): """SPIRES search syntax - find irn 1360337 == find irn SPIRES-1360337""" # Added for trac-130 with_spires = "fin irn SPIRES-1360337" with_result = perform_request_search(p=with_spires) without_spires = "fin irn 1360337" without_result = perform_request_search(p=without_spires) # We don't care if results are [], as long as they're the same # Uncovered corner case: parsing could be broken and also happen to # return [] twice. Unlikely though. self.assertEqual(with_result, without_result) def test_quotes(self): """SPIRES search syntax - find t 'compton scattering' and a mele""" inv_search = "title:'compton scattering' and author:mele" spi_search = "find t 'compton scattering' and a mele" self._compare_searches(inv_search, spi_search) def test_fin_to_find_trans(self): """SPIRES search syntax - fin a ellis, j == find a ellis, j""" fin_search = "fin a ellis, j" fin_result = perform_request_search(p=fin_search) find_search = "find a ellis, j" find_result = perform_request_search(p=find_search) # We don't care if results are [], as long as they're the same # Uncovered corner case: parsing could be broken and also happen to # return [] twice. Unlikely though. 
self.assertEqual(fin_result, find_result) def test_distribution_of_notted_search_terms(self): """SPIRES search syntax - find t this and not that ->title:this and not title:that""" spi_search = "find t this and not that" inv_search = "title:this and not title:that" self._compare_searches(inv_search, spi_search) def test_distribution_without_spacing(self): """SPIRES search syntax - find aff SLAC and Stanford ->affiliation:SLAC and affiliation:Stanford""" # motivated by trac-187 spi_search = "find aff SLAC and Stanford" inv_search = "affiliation:SLAC and affiliation:Stanford" self._compare_searches(inv_search, spi_search) def test_keyword_as_kw(self): """SPIRES search syntax - find kw something ->keyword:something""" spi_search = "find kw meson" inv_search = "keyword:meson" self._compare_searches(inv_search, spi_search) def test_desy_keyword_translation(self): """SPIRES search syntax - find dk "B --> pi pi" """ spi_search = "find dk \"B --> pi pi\"" inv_search = "695__a:\"B --> pi pi\"" self._compare_searches(inv_search, spi_search) def test_distribution_of_search_terms(self): """SPIRES search syntax - find t this and that ->title:this and title:that""" spi_search = "find t this and that" inv_search = "title:this and title:that" self._compare_searches(inv_search, spi_search) def test_syntax_converter_expand_search_patterns_alone(self): """SPIRES search syntax - simplest expansion""" spi_search = "find t bob sam" inv_search = "title:bob and title:sam" self._compare_searches(inv_search, spi_search) def test_syntax_converter_expand_fulltext(self): """SPIRES search syntax - fulltext support""" spi_search = "find ft The holographic RG is based on" inv_search = "fulltext:The and fulltext:holographic and fulltext:RG and fulltext:is and fulltext:based and fulltext:on" self._compare_searches(inv_search, spi_search) def test_syntax_converter_expand_fulltext_within_larger(self): """SPIRES search syntax - fulltext subsearch support""" spi_search = "find au taylor and ft The holographic RG is based on and t brane" inv_search = "author:taylor fulltext:The and fulltext:holographic and fulltext:RG and fulltext:is and fulltext:based and fulltext:on title:brane" self._compare_searches(inv_search, spi_search) def test_syntax_converter_expand_search_patterns_conjoined(self): """SPIRES search syntax - simplest distribution""" spi_search = "find t bob and sam" inv_search = "title:bob and title:sam" self._compare_searches(inv_search, spi_search) def test_syntax_converter_expand_search_patterns_multiple(self): """SPIRES search syntax - expansion (no distribution)""" spi_search = "find t bob sam and k couch" inv_search = "title:bob and title:sam and keyword:couch" self._compare_searches(inv_search, spi_search) def test_syntax_converter_expand_search_patterns_multiple_conjoined(self): """SPIRES search syntax - distribution and expansion""" spi_search = "find t bob sam and couch" inv_search = "title:bob and title:sam and title:couch" self._compare_searches(inv_search, spi_search) def test_date_by_yr(self): """SPIRES search syntax - searching by date year""" spi_search = "find date 2002" inv_search = "year:2002" self._compare_searches(inv_search, spi_search) def test_date_by_lt_yr(self): """SPIRES search syntax - searching by date < year""" spi_search = "find date < 2002" inv_search = 'year:0->2002' self._compare_searches(inv_search, spi_search) def test_date_by_gt_yr(self): """SPIRES search syntax - searching by date > year""" spi_search = "find date > 1980" inv_search = 'year:1980->9999' self._compare_searches(inv_search, spi_search) def
test_date_by_yr_mo(self): """SPIRES search syntax - searching by date 1976-04""" spi_search = "find date 1976-04" inv_search = 'year:1976-04' self._compare_searches(inv_search, spi_search) def test_date_by_eq_yr_mo(self): """SPIRES search syntax - searching by date 1976-04""" spi_search = "find date 1976-04" inv_search = 'year:1976-04' self._compare_searches(inv_search, spi_search) def test_date_by_lt_yr_mo(self): """SPIRES search syntax - searching by date < 1978-10-21""" spi_search = "find date < 1978-10-21" inv_search = 'year:0->1978-10-21' self._compare_searches(inv_search, spi_search) def test_date_by_gt_yr_mo(self): """SPIRES search syntax - searching by date > 1978-10-21""" spi_search = "find date > 1978-10-21" inv_search = 'year:1978-10-21->9999' self._compare_searches(inv_search, spi_search) if DATEUTIL_AVAILABLE: def test_date_by_lt_d_MO_yr(self): """SPIRES search syntax - searching by date < 23 Sep 2010: will only work with dateutil installed""" spi_search = "find date < 23 Sep 2010" inv_search = 'year:0->2010-09-23' self._compare_searches(inv_search, spi_search) def test_date_by_gt_d_MO_yr(self): """SPIRES search syntax - searching by date > 12 Jun 1960: will only work with dateutil installed""" spi_search = "find date > 12 Jun 1960" inv_search = 'year:1960-06-12->9999' self._compare_searches(inv_search, spi_search) def test_spires_syntax_detected_f(self): """SPIRES search syntax - test detection f t p""" # trac #261 converter = search_engine_query_parser.SpiresToInvenioSyntaxConverter() spi_search = converter.is_applicable("f t p") self.assertEqual(spi_search, True) def test_spires_syntax_detected_fin(self): """SPIRES search syntax - test detection fin t p""" # trac #261 converter = search_engine_query_parser.SpiresToInvenioSyntaxConverter() spi_search = converter.is_applicable("fin t p") self.assertEqual(spi_search, True) def test_spires_syntax_detected_find(self): """SPIRES search syntax - test detection find t p""" # trac #261 converter = search_engine_query_parser.SpiresToInvenioSyntaxConverter() spi_search = converter.is_applicable("find t p") self.assertEqual(spi_search, True) def test_spires_syntax_detected_invenio(self): """SPIRES search syntax - test detection Not SPIRES""" # trac #261 converter = search_engine_query_parser.SpiresToInvenioSyntaxConverter() inv_search = converter.is_applicable("t:p a:c") self.assertEqual(inv_search, False) TEST_SUITE = make_test_suite(TestSearchQueryParenthesisedParser, TestSpiresToInvenioSyntaxConverter, TestParserUtilityFunctions) if __name__ == "__main__": run_test_suite(TEST_SUITE) #run_test_suite(make_test_suite(TestParserUtilityFunctions, TestSearchQueryParenthesisedParser)) # DEBUG diff --git a/modules/websearch/lib/websearch_regression_tests.py b/modules/websearch/lib/websearch_regression_tests.py index 213054c7f..aa3894df1 100644 --- a/modules/websearch/lib/websearch_regression_tests.py +++ b/modules/websearch/lib/websearch_regression_tests.py @@ -1,1708 +1,1719 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. # pylint: disable=C0301 # pylint: disable=E1102 """WebSearch module regression tests.""" __revision__ = "$Id$" import unittest import re import urlparse, cgi import sys if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 from mechanize import Browser, LinkNotFoundError from invenio.config import CFG_SITE_URL, CFG_SITE_NAME, CFG_SITE_LANG from invenio.testutils import make_test_suite, \ run_test_suite, \ make_url, test_web_page_content, \ merge_error_messages from invenio.urlutils import same_urls_p from invenio.search_engine import perform_request_search, \ guess_primary_collection_of_a_record, guess_collection_of_a_record, \ collection_restricted_p, get_permitted_restricted_collections, \ get_fieldvalues def parse_url(url): parts = urlparse.urlparse(url) query = cgi.parse_qs(parts[4], True) return parts[2].split('/')[1:], query class WebSearchWebPagesAvailabilityTest(unittest.TestCase): """Check WebSearch web pages whether they are up or not.""" def test_search_interface_pages_availability(self): """websearch - availability of search interface pages""" baseurl = CFG_SITE_URL + '/' _exports = ['', 'collection/Poetry', 'collection/Poetry?as=1'] error_messages = [] for url in [baseurl + page for page in _exports]: error_messages.extend(test_web_page_content(url)) if error_messages: self.fail(merge_error_messages(error_messages)) return def test_search_results_pages_availability(self): """websearch - availability of search results pages""" baseurl = CFG_SITE_URL + '/search' _exports = ['', '?c=Poetry', '?p=ellis', '/cache', '/log'] error_messages = [] for url in [baseurl + page for page in _exports]: error_messages.extend(test_web_page_content(url)) if error_messages: self.fail(merge_error_messages(error_messages)) return def test_search_detailed_record_pages_availability(self): """websearch - availability of search detailed record pages""" baseurl = CFG_SITE_URL + '/record/' _exports = ['', '1', '1/', '1/files', '1/files/'] error_messages = [] for url in [baseurl + page for page in _exports]: error_messages.extend(test_web_page_content(url)) if error_messages: self.fail(merge_error_messages(error_messages)) return def test_browse_results_pages_availability(self): """websearch - availability of browse results pages""" baseurl = CFG_SITE_URL + '/search' _exports = ['?p=ellis&f=author&action_browse=Browse'] error_messages = [] for url in [baseurl + page for page in _exports]: error_messages.extend(test_web_page_content(url)) if error_messages: self.fail(merge_error_messages(error_messages)) return def test_help_page_availability(self): """websearch - availability of Help Central page""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help', expected_text="Help Central")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/?ln=fr', expected_text="Centre d'aide")) def test_search_tips_page_availability(self): """websearch - availability of Search Tips""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-tips', expected_text="Search Tips")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-tips?ln=fr', expected_text="Conseils de recherche")) def test_search_guide_page_availability(self): """websearch - availability of 
Search Guide""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-guide', expected_text="Search Guide")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-guide?ln=fr', expected_text="Guide de recherche")) class WebSearchTestLegacyURLs(unittest.TestCase): """ Check that the application still responds to legacy URLs for navigating, searching and browsing.""" def test_legacy_collections(self): """ websearch - collections handle legacy urls """ browser = Browser() def check(legacy, new, browser=browser): browser.open(legacy) got = browser.geturl() self.failUnless(same_urls_p(got, new), got) # Use the root URL unless we need more check(make_url('/', c=CFG_SITE_NAME), make_url('/', ln=CFG_SITE_LANG)) # Other collections are redirected in the /collection area check(make_url('/', c='Poetry'), make_url('/collection/Poetry', ln=CFG_SITE_LANG)) # Drop unnecessary arguments, like ln and as (when they are # the default value) args = {'as': 0} check(make_url('/', c='Poetry', **args), make_url('/collection/Poetry', ln=CFG_SITE_LANG)) # Otherwise, keep them args = {'as': 1, 'ln': CFG_SITE_LANG} check(make_url('/', c='Poetry', **args), make_url('/collection/Poetry', **args)) # Support the /index.py addressing too check(make_url('/index.py', c='Poetry'), make_url('/collection/Poetry', ln=CFG_SITE_LANG)) def test_legacy_search(self): """ websearch - search queries handle legacy urls """ browser = Browser() def check(legacy, new, browser=browser): browser.open(legacy) got = browser.geturl() self.failUnless(same_urls_p(got, new), got) # /search.py is redirected on /search # Note that `as' is a reserved word in Python 2.5 check(make_url('/search.py', p='nuclear', ln='en') + 'as=1', make_url('/search', p='nuclear', ln='en') + 'as=1') # direct recid searches are redirected to /record check(make_url('/search.py', recid=1, ln='es'), make_url('/record/1', ln='es')) def test_legacy_search_help_link(self): """websearch - legacy Search Help page link""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search/index.en.html', expected_text="Help Central")) def test_legacy_search_tips_link(self): """websearch - legacy Search Tips page link""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search/tips.fr.html', expected_text="Conseils de recherche")) def test_legacy_search_guide_link(self): """websearch - legacy Search Guide page link""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search/guide.en.html', expected_text="Search Guide")) class WebSearchTestRecord(unittest.TestCase): """ Check the interface of the /record results """ def test_format_links(self): """ websearch - check format links for records """ browser = Browser() # We open the record in all known HTML formats for hformat in ('hd', 'hx', 'hm'): browser.open(make_url('/record/1', of=hformat)) if hformat == 'hd': # hd format should have a link to the following # formats for oformat in ('hx', 'hm', 'xm', 'xd'): target = make_url('/record/1/export/%s?ln=en' % oformat) try: browser.find_link(url=target) except LinkNotFoundError: self.fail('link %r should be in page' % target) else: # non-hd HTML formats should have a link back to # the main detailed record target = make_url('/record/1') try: browser.find_link(url=target) except LinkNotFoundError: self.fail('link %r should be in page' % target) return def test_exported_formats(self): """ websearch - check formats exported through /record/1/export/ URLs""" browser = Browser() self.assertEqual([], 
test_web_page_content(make_url('/record/1/export/hm'), expected_text='245__ $$aALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/hd'), expected_text='ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/xm'), expected_text='ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/xd'), expected_text='ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/hs'), expected_text='<a href="/record/1?ln=%s">ALEPH experiment</a>' % \ CFG_SITE_LANG)) self.assertEqual([], test_web_page_content(make_url('/record/1/export/hx'), expected_text='title = "ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/t?ot=245'), expected_text='245__ $$aALEPH experiment')) self.assertNotEqual([], test_web_page_content(make_url('/record/1/export/t?ot=245'), expected_text='001__')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/h?ot=245'), expected_text='245__ $$aALEPH experiment')) self.assertNotEqual([], test_web_page_content(make_url('/record/1/export/h?ot=245'), expected_text='001__')) return
class WebSearchTestCollections(unittest.TestCase):
def test_traversal_links(self): """ websearch - traverse all the publications of a collection """ browser = Browser() try: for aas in (0, 1): args = {'as': aas} browser.open(make_url('/collection/Preprints', **args)) for jrec in (11, 21, 11, 28): args = {'jrec': jrec, 'cc': 'Preprints'} if aas: args['as'] = aas url = make_url('/search', **args) try: browser.follow_link(url=url) except LinkNotFoundError: args['ln'] = CFG_SITE_LANG url = make_url('/search', **args) browser.follow_link(url=url) except LinkNotFoundError: self.fail('no link %r in %r' % (url, browser.geturl()))
def test_collections_links(self): """ websearch - enter in collections and subcollections """ browser = Browser() def tryfollow(url): cur = browser.geturl() body = browser.response().read() try: browser.follow_link(url=url) except LinkNotFoundError: print body self.fail("in %r: could not find %r" % ( cur, url)) return for aas in (0, 1): if aas: kargs = {'as': 1} else: kargs = {} kargs['ln'] = CFG_SITE_LANG # We navigate from immediate son to immediate son... browser.open(make_url('/', **kargs)) tryfollow(make_url('/collection/Articles%20%26%20Preprints', **kargs)) tryfollow(make_url('/collection/Articles', **kargs)) # But we can also jump to a grandson immediately browser.back() browser.back() tryfollow(make_url('/collection/ALEPH', **kargs)) return
def test_records_links(self): """ websearch - check the links toward records in leaf collections """ browser = Browser() browser.open(make_url('/collection/Preprints')) def harvest(): """ Parse all the links in the page, and check that for each link to a detailed record, we also have the corresponding link to the similar records.""" records = set() similar = set() for link in browser.links(): path, q = parse_url(link.url) if not path: continue if path[0] == 'record': records.add(int(path[1])) continue if path[0] == 'search': if not q.get('rm') == ['wrd']: continue recid = q['p'][0].split(':')[1] similar.add(int(recid)) self.failUnlessEqual(records, similar) return records # We must have 10 links to the corresponding /records found = harvest() self.failUnlessEqual(len(found), 10) # When clicking on the "Search" button, we must also have # these 10 links on the records.
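# (Note: harvest() above enforces the invariant records == similar, i.e.
# every /record/<id> link on the page is paired with a /search?...rm=wrd
# "similar records" link pointing at the same record id.)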
browser.select_form(name="search") browser.submit() found = harvest() self.failUnlessEqual(len(found), 10) return class WebSearchTestBrowse(unittest.TestCase): def test_browse_field(self): """ websearch - check that browsing works """ browser = Browser() browser.open(make_url('/')) browser.select_form(name='search') browser['f'] = ['title'] browser.submit(name='action_browse') def collect(): # We'll get a few links to search for the actual hits, plus a # link to the following results. res = [] for link in browser.links(url_regex=re.compile(CFG_SITE_URL + r'/search\?')): if link.text == 'Advanced Search': continue dummy, q = parse_url(link.url) res.append((link, q)) return res # if we follow the last link, we should get another # batch. There is an overlap of one item. batch_1 = collect() browser.follow_link(link=batch_1[-1][0]) batch_2 = collect() # FIXME: we cannot compare the whole query, as the collection # set is not equal self.failUnlessEqual(batch_1[-2][1]['p'], batch_2[0][1]['p']) class WebSearchTestOpenURL(unittest.TestCase): def test_isbn_01(self): """ websearch - isbn query via OpenURL 0.1""" browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/openurl', isbn='0387940758')) dummy, current_q = parse_url(browser.geturl()) self.failUnlessEqual(current_q, { 'sc' : ['1'], 'p' : ['isbn:"0387940758"'], 'of' : ['hd'] }) def test_isbn_10_rft_id(self): """ websearch - isbn query via OpenURL 1.0 - rft_id""" browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/openurl', rft_id='urn:ISBN:0387940758')) dummy, current_q = parse_url(browser.geturl()) self.failUnlessEqual(current_q, { 'sc' : ['1'], 'p' : ['isbn:"0387940758"'], 'of' : ['hd'] }) def test_isbn_10(self): """ websearch - isbn query via OpenURL 1.0""" browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/openurl?rft.isbn=0387940758')) dummy, current_q = parse_url(browser.geturl()) self.failUnlessEqual(current_q, { 'sc' : ['1'], 'p' : ['isbn:"0387940758"'], 'of' : ['hd'] }) class WebSearchTestSearch(unittest.TestCase): def test_hits_in_other_collection(self): """ websearch - check extension of a query to the home collection """ browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/collection/ISOLDE', ln='en')) browser.select_form(name='search') browser['f'] = ['author'] browser['p'] = 'matsubara' browser.submit() dummy, current_q = parse_url(browser.geturl()) link = browser.find_link(text_regex=re.compile('.*hit', re.I)) dummy, target_q = parse_url(link.url) # the target query should be the current query without any c # or cc specified. for f in ('cc', 'c', 'action_search'): if f in current_q: del current_q[f] self.failUnlessEqual(current_q, target_q) def test_nearest_terms(self): """ websearch - provide a list of nearest terms """ browser = Browser() browser.open(make_url('')) # Search something weird browser.select_form(name='search') browser['p'] = 'gronf' browser.submit() dummy, original = parse_url(browser.geturl()) for to_drop in ('cc', 'action_search', 'f'): if to_drop in original: del original[to_drop] if 'ln' not in original: original['ln'] = [CFG_SITE_LANG] # we should get a few searches back, which are identical # except for the p field being substituted (and the cc field # being dropped). 
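# For example (hypothetical proposed terms): the failing /search?p=gronf&ln=en
# page may propose /search?p=grand&ln=en and /search?p=ground&ln=en, each link
# differing from the original query only in the substituted p value.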
if 'cc' in original: del original['cc'] for link in browser.links(url_regex=re.compile(CFG_SITE_URL + r'/search\?')): if link.text == 'Advanced Search': continue dummy, target = parse_url(link.url) if 'ln' not in target: target['ln'] = [CFG_SITE_LANG] original['p'] = [link.text] self.failUnlessEqual(original, target) return def test_switch_to_simple_search(self): """ websearch - switch to simple search """ browser = Browser() args = {'as': 1} browser.open(make_url('/collection/ISOLDE', **args)) browser.select_form(name='search') browser['p1'] = 'tandem' browser['f1'] = ['title'] browser.submit() browser.follow_link(text='Simple Search') dummy, q = parse_url(browser.geturl()) self.failUnlessEqual(q, {'cc': ['ISOLDE'], 'p': ['tandem'], 'f': ['title'], 'ln': ['en']}) def test_switch_to_advanced_search(self): """ websearch - switch to advanced search """ browser = Browser() browser.open(make_url('/collection/ISOLDE')) browser.select_form(name='search') browser['p'] = 'tandem' browser['f'] = ['title'] browser.submit() browser.follow_link(text='Advanced Search') dummy, q = parse_url(browser.geturl()) self.failUnlessEqual(q, {'cc': ['ISOLDE'], 'p1': ['tandem'], 'f1': ['title'], 'as': ['1'], 'ln' : ['en']}) def test_no_boolean_hits(self): """ websearch - check the 'no boolean hits' proposed links """ browser = Browser() browser.open(make_url('')) browser.select_form(name='search') browser['p'] = 'quasinormal muon' browser.submit() dummy, q = parse_url(browser.geturl()) for to_drop in ('cc', 'action_search', 'f'): if to_drop in q: del q[to_drop] for bsu in ('quasinormal', 'muon'): l = browser.find_link(text=bsu) q['p'] = bsu if not same_urls_p(l.url, make_url('/search', **q)): self.fail(repr((l.url, make_url('/search', **q)))) def test_similar_authors(self): """ websearch - test similar authors box """ browser = Browser() browser.open(make_url('')) browser.select_form(name='search') browser['p'] = 'Ellis, R K' browser['f'] = ['author'] browser.submit() l = browser.find_link(text="Ellis, R S") self.failUnless(same_urls_p(l.url, make_url('/search', p="Ellis, R S", f='author', ln='en'))) class WebSearchNearestTermsTest(unittest.TestCase): """Check various alternatives of searches leading to the nearest terms box.""" def test_nearest_terms_box_in_okay_query(self): """ websearch - no nearest terms box for a successful query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text="jump to record")) def test_nearest_terms_box_in_unsuccessful_simple_query(self): """ websearch - nearest terms box for unsuccessful simple query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellisz', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=embed", expected_link_label='embed')) def test_nearest_terms_box_in_unsuccessful_simple_accented_query(self): """ websearch - nearest terms box for unsuccessful accented query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=elliszà', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=embed", expected_link_label='embed')) def test_nearest_terms_box_in_unsuccessful_structured_query(self): """ websearch - nearest terms box for unsuccessful structured query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellisz&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=fabbro&f=author", 
expected_link_label='fabbro')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=author%3Aellisz', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=author%3Afabbro", expected_link_label='fabbro')) def test_nearest_terms_box_in_unsuccessful_phrase_query(self): """ websearch - nearest terms box for unsuccessful phrase query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%22Ellis%2C+Z%22', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=author%3A%22Enqvist%2C+K%22", expected_link_label='Enqvist, K')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%22ellisz%22&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=%22Enqvist%2C+K%22&f=author", expected_link_label='Enqvist, K')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%22elliszà%22&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=%22Enqvist%2C+K%22&f=author", expected_link_label='Enqvist, K')) def test_nearest_terms_box_in_unsuccessful_partial_phrase_query(self): """ websearch - nearest terms box for unsuccessful partial phrase query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%27Ellis%2C+Z%27', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=author%3A%27Enqvist%2C+K%27", expected_link_label='Enqvist, K')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%27ellisz%27&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=%27Enqvist%2C+K%27&f=author", expected_link_label='Enqvist, K')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%27elliszà%27&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=%27Enqvist%2C+K%27&f=author", expected_link_label='Enqvist, K')) def test_nearest_terms_box_in_unsuccessful_partial_phrase_advanced_query(self): """ websearch - nearest terms box for unsuccessful partial phrase advanced search query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p1=aaa&f1=title&m1=p&as=1', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&f1=title&as=1&p1=A+simple+functional+form+for+proton-nucleus+total+reaction+cross+sections&m1=p", expected_link_label='A simple functional form for proton-nucleus total reaction cross sections')) def test_nearest_terms_box_in_unsuccessful_exact_phrase_advanced_query(self): """ websearch - nearest terms box for unsuccessful exact phrase advanced search query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p1=aaa&f1=title&m1=e&as=1', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&f1=title&as=1&p1=A+simple+functional+form+for+proton-nucleus+total+reaction+cross+sections&m1=e", expected_link_label='A simple functional form for proton-nucleus total reaction cross sections')) def test_nearest_terms_box_in_unsuccessful_boolean_query(self): """ websearch - nearest terms box for unsuccessful boolean query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3Aellisz+author%3Aellisz', expected_text="Nearest terms in any collection are", 
expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aellisz", expected_link_label='energi')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3Aenergi+author%3Aenergie', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aenqvist", expected_link_label='enqvist')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?ln=en&p=title%3Aellisz+author%3Aellisz&f=keyword', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aellisz&f=keyword", expected_link_label='energi')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?ln=en&p=title%3Aenergi+author%3Aenergie&f=keyword', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aenqvist&f=keyword", expected_link_label='enqvist')) class WebSearchBooleanQueryTest(unittest.TestCase): """Check various boolean queries.""" def test_successful_boolean_query(self): """ websearch - successful boolean query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis+muon', expected_text="records found", expected_link_label="Detailed record")) def test_unsuccessful_boolean_query_where_all_individual_terms_match(self): """ websearch - unsuccessful boolean query where all individual terms match """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis+muon+letter', expected_text="Boolean query returned no hits. Please combine your search terms differently.")) class WebSearchAuthorQueryTest(unittest.TestCase): """Check various author-related queries.""" def test_propose_similar_author_names_box(self): """ websearch - propose similar author names box """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=Ellis%2C+R&f=author', expected_text="See also: similar author names", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=Ellis%2C+R+K&f=author", expected_link_label="Ellis, R K")) def test_do_not_propose_similar_author_names_box(self): """ websearch - do not propose similar author names box """ errmsgs = test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%22Ellis%2C+R%22', expected_link_target=CFG_SITE_URL+"/search?ln=en&p=Ellis%2C+R+K&f=author", expected_link_label="Ellis, R K") if errmsgs[0].find("does not contain link to") > -1: pass else: self.fail("Should not propose similar author names box.") return class WebSearchSearchEnginePythonAPITest(unittest.TestCase): """Check typical search engine Python API calls on the demo data.""" def test_search_engine_python_api_for_failed_query(self): """websearch - search engine Python API for failed query""" self.assertEqual([], perform_request_search(p='aoeuidhtns')) def test_search_engine_python_api_for_successful_query(self): """websearch - search engine Python API for successful query""" self.assertEqual([8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 47], perform_request_search(p='ellis')) def test_search_engine_python_api_for_existing_record(self): """websearch - search engine Python API for existing record""" self.assertEqual([8], perform_request_search(recid=8)) def test_search_engine_python_api_for_nonexisting_record(self): """websearch - search engine Python API for non-existing record""" self.assertEqual([], perform_request_search(recid=1234567809)) def test_search_engine_python_api_for_nonexisting_collection(self): """websearch - search engine Python API 
for non-existing collection""" self.assertEqual([], perform_request_search(c='Foo')) def test_search_engine_python_api_for_range_of_records(self): """websearch - search engine Python API for range of records""" self.assertEqual([1, 2, 3, 4, 5, 6, 7, 8, 9], perform_request_search(recid=1, recidb=10)) def test_search_engine_python_api_ranked_by_citation(self): """websearch - search engine Python API for citation ranking""" self.assertEqual([82, 83, 87, 89], perform_request_search(p='recid:81', rm='citation')) def test_search_engine_python_api_textmarc(self): """websearch - search engine Python API for Text MARC output""" # we are testing example from /help/hacking/search-engine-api import cStringIO tmp = cStringIO.StringIO() perform_request_search(req=tmp, p='higgs', of='tm', ot=['100', '700']) out = tmp.getvalue() tmp.close() self.assertEqual(out, """\ 000000085 100__ $$aGirardello, L$$uINFN$$uUniversita di Milano-Bicocca 000000085 700__ $$aPorrati, Massimo 000000085 700__ $$aZaffaroni, A 000000001 100__ $$aPhotolab """) class WebSearchSearchEngineWebAPITest(unittest.TestCase): """Check typical search engine Web API calls on the demo data.""" def test_search_engine_web_api_for_failed_query(self): """websearch - search engine Web API for failed query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=aoeuidhtns&of=id', expected_text="[]")) def test_search_engine_web_api_for_successful_query(self): """websearch - search engine Web API for successful query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis&of=id', expected_text="[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 47]")) def test_search_engine_web_api_for_existing_record(self): """websearch - search engine Web API for existing record""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?recid=8&of=id', expected_text="[8]")) def test_search_engine_web_api_for_nonexisting_record(self): """websearch - search engine Web API for non-existing record""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?recid=123456789&of=id', expected_text="[]")) def test_search_engine_web_api_for_nonexisting_collection(self): """websearch - search engine Web API for non-existing collection""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?c=Foo&of=id', expected_text="[]")) def test_search_engine_web_api_for_range_of_records(self): """websearch - search engine Web API for range of records""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?recid=1&recidb=10&of=id', expected_text="[1, 2, 3, 4, 5, 6, 7, 8, 9]")) class WebSearchRestrictedCollectionTest(unittest.TestCase): """Test of the restricted Theses collection behaviour.""" def test_restricted_collection_interface_page(self): """websearch - restricted collection interface page body""" # there should be no Latest additions box for restricted collections self.assertNotEqual([], test_web_page_content(CFG_SITE_URL + '/collection/Theses', expected_text="Latest additions")) def test_restricted_search_as_anonymous_guest(self): """websearch - restricted collection not searchable by anonymous guest""" browser = Browser() browser.open(CFG_SITE_URL + '/search?c=Theses') response = browser.response().read() if response.find("If you think you have right to access it, please authenticate yourself.") > -1: pass else: self.fail("Oops, searching restricted collection without password should have redirected to login dialog.") return def test_restricted_search_as_authorized_person(self): """websearch - 
restricted collection searchable by authorized person""" browser = Browser() browser.open(CFG_SITE_URL + '/search?c=Theses') browser.select_form(nr=0) browser['p_un'] = 'jekyll' browser['p_pw'] = 'j123ekyll' browser.submit() if browser.response().read().find("records found") > -1: pass else: self.fail("Oops, Dr. Jekyll should be able to search Theses collection.")
def test_restricted_search_as_unauthorized_person(self): """websearch - restricted collection not searchable by unauthorized person""" browser = Browser() browser.open(CFG_SITE_URL + '/search?c=Theses') browser.select_form(nr=0) browser['p_un'] = 'hyde' browser['p_pw'] = 'h123yde' browser.submit() # Mr. Hyde should not be able to connect: if browser.response().read().find("Authorization failure") <= -1: # if we got here, things are broken: self.fail("Oops, Mr. Hyde should not be able to search Theses collection.")
def test_restricted_detailed_record_page_as_anonymous_guest(self): """websearch - restricted detailed record page not accessible to guests""" browser = Browser() browser.open(CFG_SITE_URL + '/record/35') if browser.response().read().find("You can use your nickname or your email address to login.") > -1: pass else: self.fail("Oops, searching restricted collection without password should have redirected to login dialog.") return
def test_restricted_detailed_record_page_as_authorized_person(self): """websearch - restricted detailed record page accessible to authorized person""" browser = Browser() browser.open(CFG_SITE_URL + '/youraccount/login') browser.select_form(nr=0) browser['p_un'] = 'jekyll' browser['p_pw'] = 'j123ekyll' browser.submit() browser.open(CFG_SITE_URL + '/record/35') # Dr. Jekyll should be able to connect # (add the pw to the whole CFG_SITE_URL because we shall be # redirected to '/recordrestricted/'): if browser.response().read().find("A High-performance Video Browsing System") > -1: pass else: self.fail("Oops, Dr. Jekyll should be able to access restricted detailed record page.")
def test_restricted_detailed_record_page_as_unauthorized_person(self): """websearch - restricted detailed record page not accessible to unauthorized person""" browser = Browser() browser.open(CFG_SITE_URL + '/youraccount/login') browser.select_form(nr=0) browser['p_un'] = 'hyde' browser['p_pw'] = 'h123yde' browser.submit() browser.open(CFG_SITE_URL + '/record/35') # Mr. Hyde should not be able to connect: if browser.response().read().find('You are not authorized') <= -1: # if we got here, things are broken: self.fail("Oops, Mr. Hyde should not be able to access restricted detailed record page.")
def test_collection_restricted_p(self): """websearch - collection_restricted_p""" self.failUnless(collection_restricted_p('Theses')) self.failIf(collection_restricted_p('Books & Reports'))
def test_get_permitted_restricted_collections(self): """websearch - get_permitted_restricted_collections""" from invenio.webuser import get_uid_from_email, collect_user_info self.assertEqual(get_permitted_restricted_collections(collect_user_info(get_uid_from_email('jekyll@cds.cern.ch'))), ['Theses']) self.assertEqual(get_permitted_restricted_collections(collect_user_info(get_uid_from_email('hyde@cds.cern.ch'))), [])
class WebSearchRestrictedPicturesTest(unittest.TestCase): """ Check whether restricted pictures on the demo site can be accessed well by people who have rights to access them.
""" def test_restricted_pictures_guest(self): """websearch - restricted pictures not available to guest""" error_messages = test_web_page_content(CFG_SITE_URL + '/record/1/files/0106015_01.jpg', expected_text=['This file is restricted. If you think you have right to access it, please authenticate yourself.']) if error_messages: self.fail(merge_error_messages(error_messages)) def test_restricted_pictures_romeo(self): """websearch - restricted pictures available to Romeo""" error_messages = test_web_page_content(CFG_SITE_URL + '/record/1/files/0106015_01.jpg', username='romeo', password='r123omeo', expected_text=[], unexpected_text=['This file is restricted', 'You are not authorized']) if error_messages: self.fail(merge_error_messages(error_messages)) def test_restricted_pictures_hyde(self): """websearch - restricted pictures not available to Mr. Hyde""" error_messages = test_web_page_content(CFG_SITE_URL + '/record/1/files/0106015_01.jpg', username='hyde', password='h123yde', expected_text=['This file is restricted', 'You are not authorized']) if error_messages: self.failUnless("HTTP Error 401: Unauthorized" in merge_error_messages(error_messages)) class WebSearchRSSFeedServiceTest(unittest.TestCase): """Test of the RSS feed service.""" def test_rss_feed_service(self): """websearch - RSS feed service""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/rss', expected_text=' -1: self.fail("Oops, when split by collection is off, " "results overview should not be present.") if body.find('') == -1: self.fail("Oops, when split by collection is off, " "Atlantis collection should be found.") if body.find('') > -1: self.fail("Oops, when split by collection is off, " "Multimedia & Arts should not be found.") try: browser.find_link(url='#15') self.fail("Oops, when split by collection is off, " "a link to Multimedia & Arts should not be found.") except LinkNotFoundError: pass def test_results_overview_split_on(self): """websearch - results overview box when split by collection is on""" browser = Browser() browser.open(CFG_SITE_URL + '/search?p=of&sc=1') body = browser.response().read() if body.find("Results overview") == -1: self.fail("Oops, when split by collection is on, " "results overview should be present.") if body.find('') > -1: self.fail("Oops, when split by collection is on, " "Atlantis collection should not be found.") if body.find('') == -1: self.fail("Oops, when split by collection is on, " "Multimedia & Arts should be found.") try: browser.find_link(url='#15') except LinkNotFoundError: self.fail("Oops, when split by collection is on, " "a link to Multimedia & Arts should be found.") class WebSearchSortResultsTest(unittest.TestCase): """Test of the search results page's sorting capability.""" def test_sort_results_default(self): """websearch - search results sorting, default method""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1', expected_text="[TESLA-FEL-99-07]")) def test_sort_results_ascending(self): """websearch - search results sorting, ascending field""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1&sf=reportnumber&so=a', expected_text="ISOLTRAP")) def test_sort_results_descending(self): """websearch - search results sorting, descending field""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1&sf=reportnumber&so=d', expected_text=" [TESLA-FEL-99-07]")) def test_sort_results_sort_pattern(self): """websearch - search results sorting, preferential sort pattern""" 
self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1&sf=reportnumber&so=d&sp=cern', expected_text="[CERN-TH-2002-069]"))
class WebSearchSearchResultsXML(unittest.TestCase): """Test search results in various output"""
def test_search_results_xm_output_split_on(self): """ websearch - check document element of search results in xm output (split by collection on)""" browser = Browser() browser.open(CFG_SITE_URL + '/search?sc=1&of=xm') body = browser.response().read() num_doc_element = body.count("<collection " "xmlns=\"http://www.loc.gov/MARC21/slim\">") if num_doc_element == 0: self.fail("Oops, no document element <collection> " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements <collection> " "found in search results.") num_doc_element = body.count("</collection>") if num_doc_element == 0: self.fail("Oops, no document element </collection> " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements </collection> " "found in search results.")
def test_search_results_xm_output_split_off(self): """ websearch - check document element of search results in xm output (split by collection off)""" browser = Browser() browser.open(CFG_SITE_URL + '/search?sc=0&of=xm') body = browser.response().read() num_doc_element = body.count("<collection " "xmlns=\"http://www.loc.gov/MARC21/slim\">") if num_doc_element == 0: self.fail("Oops, no document element <collection> " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements <collection> " "found in search results.") num_doc_element = body.count("</collection>") if num_doc_element == 0: self.fail("Oops, no document element </collection> " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements </collection> " "found in search results.")
def test_search_results_xd_output_split_on(self): """ websearch - check document element of search results in xd output (split by collection on)""" browser = Browser() browser.open(CFG_SITE_URL + '/search?sc=1&of=xd') body = browser.response().read() num_doc_element = body.count("<collection") if num_doc_element == 0: self.fail("Oops, no document element <collection> " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements <collection> " "found in search results.") num_doc_element = body.count("</collection>") if num_doc_element == 0: self.fail("Oops, no document element </collection> " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements </collection> " "found in search results.")
def test_search_results_xd_output_split_off(self): """ websearch - check document element of search results in xd output (split by collection off)""" browser = Browser() browser.open(CFG_SITE_URL + '/search?sc=0&of=xd') body = browser.response().read() num_doc_element = body.count("<collection") if num_doc_element == 0: self.fail("Oops, no document element <collection> " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements <collection> " "found in search results.") num_doc_element = body.count("</collection>") if num_doc_element == 0: self.fail("Oops, no document element </collection> " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements </collection> " "found in search results.")
class WebSearchUnicodeQueryTest(unittest.TestCase): """Test of the search results for queries containing Unicode characters."""
def test_unicode_word_query(self): """websearch - Unicode word query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%CE%99%CE%B8%CE%AC%CE%BA%CE%B7', expected_text="[76]"))
def test_unicode_word_query_not_found_term(self): """websearch - Unicode word query, not found term""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3A%CE%99%CE%B8', expected_text="ιθάκη"))
def
test_unicode_exact_phrase_query(self): """websearch - Unicode exact phrase query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%22%CE%99%CE%B8%CE%AC%CE%BA%CE%B7%22', expected_text="[76]"))
def test_unicode_partial_phrase_query(self): """websearch - Unicode partial phrase query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%27%CE%B7%27', expected_text="[76]"))
def test_unicode_regexp_query(self): """websearch - Unicode regexp query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%2F%CE%B7%2F', expected_text="[76]"))
class WebSearchMARCQueryTest(unittest.TestCase): """Test of the search results for queries containing physical MARC tags."""
def test_single_marc_tag_exact_phrase_query(self): """websearch - single MARC tag, exact phrase query (100__a)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=100__a%3A%22Ellis%2C+J%22', expected_text="[9, 14, 18]"))
def test_single_marc_tag_partial_phrase_query(self): """websearch - single MARC tag, partial phrase query (245__b)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245__b%3A%27and%27', expected_text="[28]"))
def test_many_marc_tags_partial_phrase_query(self): """websearch - many MARC tags, partial phrase query (245)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245%3A%27and%27', expected_text="[1, 8, 9, 14, 15, 20, 22, 24, 28, 33, 47, 48, 49, 51, 53, 64, 69, 71, 79, 82, 83, 85, 91, 96]"))
def test_single_marc_tag_regexp_query(self): """websearch - single MARC tag, regexp query""" # NOTE: regexp queries for physical MARC tags (e.g. 245:/and/) # are deliberately not treated by the search engine. But maybe # we should support them?! self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245%3A%2Fand%2F', expected_text="[]"))
class WebSearchExtSysnoQueryTest(unittest.TestCase): """Test of queries using external system numbers."""
def test_existing_sysno_html_output(self): """websearch - external sysno query, existing sysno, HTML output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CER', expected_text="The wall of the cave"))
def test_existing_sysno_id_output(self): """websearch - external sysno query, existing sysno, ID output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CER&of=id', expected_text="[95]"))
def test_nonexisting_sysno_html_output(self): """websearch - external sysno query, non-existing sysno, HTML output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CERRRR', expected_text="Requested record does not seem to exist."))
def test_nonexisting_sysno_id_output(self): """websearch - external sysno query, non-existing sysno, ID output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CERRRR&of=id', expected_text="[]"))
class WebSearchResultsRecordGroupingTest(unittest.TestCase): """Test search results page record grouping (rg)."""
def test_search_results_rg_guest(self): """websearch - search results, records in groups of, guest""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?rg=17', expected_text="1 - 17"))
def test_search_results_rg_nonguest(self): """websearch - search results, records in groups of, non-guest""" # This test used to fail due to saved user preference fetching # not overridden by URL rg argument.
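# In other words, the rg=17 given in the URL must take precedence over
# whatever rg value is stored in the admin user's saved preferences.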
self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?rg=17', username='admin', expected_text="1 - 17")) class WebSearchSpecialTermsQueryTest(unittest.TestCase): """Test of the search results for queries containing special terms.""" def test_special_terms_u1(self): """websearch - query for special terms, U(1)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=U%281%29', expected_text="[57, 79, 80, 88]")) def test_special_terms_u1_and_sl(self): """websearch - query for special terms, U(1) SL(2,Z)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=U%281%29+SL%282%2CZ%29', expected_text="[88]")) def test_special_terms_u1_and_sl_or(self): """websearch - query for special terms, U(1) OR SL(2,Z)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=U%281%29+OR+SL%282%2CZ%29', expected_text="[57, 79, 80, 88]")) def test_special_terms_u1_and_sl_or_parens(self): """websearch - query for special terms, (U(1) OR SL(2,Z))""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=%28U%281%29+OR+SL%282%2CZ%29%29', expected_text="[57, 79, 80, 88]")) class WebSearchJournalQueryTest(unittest.TestCase): """Test of the search results for journal pubinfo queries.""" def test_query_journal_title_only(self): """websearch - journal publication info query, title only""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&f=journal&p=Phys.+Lett.+B', expected_text="[77, 78, 85, 87]")) def test_query_journal_full_pubinfo(self): """websearch - journal publication info query, full reference""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&f=journal&p=Phys.+Lett.+B+531+%282002%29+301', expected_text="[78]")) class WebSearchStemmedIndexQueryTest(unittest.TestCase): """Test of the search results for queries using stemmed indexes.""" def test_query_stemmed_lowercase(self): """websearch - stemmed index query, lowercase""" # note that dasse/Dasse is stemmed into dass/Dass, as expected self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=dasse', expected_text="[25, 26]")) def test_query_stemmed_uppercase(self): """websearch - stemmed index query, uppercase""" # ... but note also that DASSE is stemmed into DASSE(!); so # the test would fail if the search engine would not lower the # query term. (Something that is not necessary for # non-stemmed indexes.) 
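# Illustration (hypothetical stemmer output): stem('dasse') == 'dass' is
# what the index stores, while stem('DASSE') would stay 'DASSE'; lowercasing
# the query term before stemming is what lets the uppercase query match.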
self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=DASSE', expected_text="[25, 26]"))
class WebSearchSummarizerTest(unittest.TestCase): """Test of the search results summarizer functions."""
def test_most_popular_field_values_singletag(self): """websearch - most popular field values, simple tag""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('PREPRINT', 37), ('ARTICLE', 28), ('BOOK', 14), ('THESIS', 8), ('PICTURE', 7), ('POETRY', 2), ('REPORT', 2), ('ATLANTISTIMESNEWS', 1)), get_most_popular_field_values(range(0,100), '980__a'))
def test_most_popular_field_values_singletag_multiexclusion(self): """websearch - most popular field values, simple tag, multiple exclusions""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('PREPRINT', 37), ('ARTICLE', 28), ('BOOK', 14), ('REPORT', 2), ('ATLANTISTIMESNEWS', 1)), get_most_popular_field_values(range(0,100), '980__a', ('THESIS', 'PICTURE', 'POETRY')))
def test_most_popular_field_values_multitag(self): """websearch - most popular field values, multiple tags""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('Ellis, J', 3), ('Enqvist, K', 1), ('Ibanez, L E', 1), ('Nanopoulos, D V', 1), ('Ross, G G', 1)), get_most_popular_field_values((9, 14, 18), ('100__a', '700__a')))
def test_most_popular_field_values_multitag_singleexclusion(self): """websearch - most popular field values, multiple tags, single exclusion""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('Enqvist, K', 1), ('Ibanez, L E', 1), ('Nanopoulos, D V', 1), ('Ross, G G', 1)), get_most_popular_field_values((9, 14, 18), ('100__a', '700__a'), ('Ellis, J',)))
def test_most_popular_field_values_multitag_countrepetitive(self): """websearch - most popular field values, multiple tags, counting repetitive occurrences""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('THESIS', 2), ('REPORT', 1)), get_most_popular_field_values((41,), ('690C_a', '980__a'), count_repetitive_values=True)) self.assertEqual((('REPORT', 1), ('THESIS', 1)), get_most_popular_field_values((41,), ('690C_a', '980__a'), count_repetitive_values=False))
def test_ellis_citation_summary(self): """websearch - query ellis, citation summary output format""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis&of=hcs', expected_text="Less known papers (1-9)", expected_link_target=CFG_SITE_URL+"/search?p=ellis%20AND%20cited%3A1-%3E9&rm=citation", expected_link_label='1'))
class WebSearchRecordCollectionGuessTest(unittest.TestCase): """Primary collection guessing tests."""
def test_guess_primary_collection_of_a_record(self): """websearch - guess_primary_collection_of_a_record""" self.assertEqual(guess_primary_collection_of_a_record(96), 'Articles')
def test_guess_collection_of_a_record(self): """websearch - guess_collection_of_a_record""" self.assertEqual(guess_collection_of_a_record(96), 'Articles') self.assertEqual(guess_collection_of_a_record(96, '%s/collection/Theoretical Physics (TH)?ln=en' % CFG_SITE_URL), 'Articles') self.assertEqual(guess_collection_of_a_record(12, '%s/collection/Theoretical Physics (TH)?ln=en' % CFG_SITE_URL), 'Theoretical Physics (TH)') self.assertEqual(guess_collection_of_a_record(12, '%s/collection/Theoretical%%20Physics%%20%%28TH%%29?ln=en' % CFG_SITE_URL), 'Theoretical Physics (TH)')
class WebSearchGetFieldValuesTest(unittest.TestCase): """Testing get_fieldvalues() function.""" def
test_get_fieldvalues_001(self): """websearch - get_fieldvalues() for bibxxx-agnostic tags""" self.assertEqual(get_fieldvalues(10, '001___'), ['10']) def test_get_fieldvalues_980(self): """websearch - get_fieldvalues() for bibxxx-powered tags""" self.assertEqual(get_fieldvalues(18, '700__a'), ['Enqvist, K', 'Nanopoulos, D V']) self.assertEqual(get_fieldvalues(18, '909C1u'), ['CERN']) def test_get_fieldvalues_wildcard(self): """websearch - get_fieldvalues() for tag wildcards""" self.assertEqual(get_fieldvalues(18, '%'), []) self.assertEqual(get_fieldvalues(18, '7%'), []) self.assertEqual(get_fieldvalues(18, '700%'), ['Enqvist, K', 'Nanopoulos, D V']) self.assertEqual(get_fieldvalues(18, '909C0%'), ['1985', '13','TH']) def test_get_fieldvalues_recIDs(self): """websearch - get_fieldvalues() for list of recIDs""" self.assertEqual(get_fieldvalues([], '001___'), []) self.assertEqual(get_fieldvalues([], '700__a'), []) self.assertEqual(get_fieldvalues([10, 13], '001___'), ['10', '13']) self.assertEqual(get_fieldvalues([18, 13], '700__a'), ['Dawson, S', 'Ellis, R K', 'Enqvist, K', 'Nanopoulos, D V']) def test_get_fieldvalues_repetitive(self): """websearch - get_fieldvalues() for repetitive values""" self.assertEqual(get_fieldvalues([17, 18], '909C1u'), ['CERN', 'CERN']) self.assertEqual(get_fieldvalues([17, 18], '909C1u', repetitive_values=True), ['CERN', 'CERN']) self.assertEqual(get_fieldvalues([17, 18], '909C1u', repetitive_values=False), ['CERN']) class WebSearchAddToBasketTest(unittest.TestCase): """Test of the add-to-basket presence depending on user rights.""" def test_add_to_basket_guest(self): """websearch - add-to-basket facility allowed for guests""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', expected_text='Add to basket')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', expected_text='')) def test_add_to_basket_jekyll(self): """websearch - add-to-basket facility allowed for Dr. Jekyll""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', expected_text='Add to basket', username='jekyll', password='j123ekyll')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', expected_text='', username='jekyll', password='j123ekyll')) def test_add_to_basket_hyde(self): """websearch - add-to-basket facility denied to Mr. Hyde""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', unexpected_text='Add to basket', username='hyde', password='h123yde')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', unexpected_text='', username='hyde', password='h123yde')) class WebSearchAlertTeaserTest(unittest.TestCase): """Test of the alert teaser presence depending on user rights.""" def test_alert_teaser_guest(self): """websearch - alert teaser allowed for guests""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_link_label='email alert')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='RSS feed')) def test_alert_teaser_jekyll(self): """websearch - alert teaser allowed for Dr. 
Jekyll""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='email alert', username='jekyll', password='j123ekyll')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='RSS feed', username='jekyll', password='j123ekyll')) def test_alert_teaser_hyde(self): """websearch - alert teaser allowed for Mr. Hyde""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='email alert', username='hyde', password='h123yde')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='RSS feed', username='hyde', password='h123yde')) class WebSearchSpanQueryTest(unittest.TestCase): """Test of span queries.""" def test_span_in_word_index(self): """websearch - span query in a word index""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=year%3A1992-%3E1996&of=id&ap=0', expected_text='[17, 66, 69, 71]')) def test_span_in_phrase_index(self): """websearch - span query in a phrase index""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=year%3A%221992%22-%3E%221996%22&of=id&ap=0', expected_text='[17, 66, 69, 71]')) def test_span_in_bibxxx(self): """websearch - span query in MARC tables""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=909C0y%3A%221992%22-%3E%221996%22&of=id&ap=0', expected_text='[17, 66, 69, 71]')) def test_span_with_spaces(self): """websearch - no span query when a space is around""" # useful for reaction search self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3A%27mu%20--%3E%20e%27&of=id&ap=0', expected_text='[67]')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=245%3A%27mu%20--%3E%20e%27&of=id&ap=0', expected_text='[67]')) def test_span_in_author(self): """websearch - span query in special author index""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%22Ellis,%20K%22-%3E%22Ellis,%20RZ%22&of=id&ap=0', expected_text='[8, 11, 13, 17, 47]')) class WebSearchReferstoCitedbyTest(unittest.TestCase): """Test of refersto/citedby search operators.""" def test_refersto_recid(self): 'websearch - refersto:recid:84' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=refersto%3Arecid%3A84&of=id&ap=0', expected_text='[85, 88, 91]')) def test_refersto_repno(self): 'websearch - refersto:reportnumber:hep-th/0205061' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=refersto%3Areportnumber%3Ahep-th/0205061&of=id&ap=0', expected_text='[91]')) def test_refersto_author_word(self): 'websearch - refersto:author:klebanov' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=refersto%3Aauthor%3Aklebanov&of=id&ap=0', expected_text='[85, 86, 88, 91]')) def test_refersto_author_phrase(self): 'websearch - refersto:author:"Klebanov, I"' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=refersto%3Aauthor%3A%22Klebanov,%20I%22&of=id&ap=0', expected_text='[85, 86, 88, 91]')) def test_citedby_recid(self): 'websearch - citedby:recid:92' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Arecid%3A92&of=id&ap=0', expected_text='[74, 91]')) def test_citedby_repno(self): 'websearch - citedby:reportnumber:hep-th/0205061' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Areportnumber%3Ahep-th/0205061&of=id&ap=0', expected_text='[78]')) def test_citedby_author_word(self): 'websearch - citedby:author:klebanov' 
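# citedby:author:klebanov selects the records whose reference lists point to
# papers authored by Klebanov; in the demo data only record 95 qualifies.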
self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Aauthor%3Aklebanov&of=id&ap=0', expected_text='[95]')) def test_citedby_author_phrase(self): 'websearch - citedby:author:"Klebanov, I"' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Aauthor%3A%22Klebanov,%20I%22&of=id&ap=0', expected_text='[95]')) +class WebSearchSPIRESSyntaxTest(unittest.TestCase): + """Test of SPIRES syntax issues""" + + def test_and_not_parens(self): + 'websearch - find a ellis, j and not a enqvist' + self.assertEqual([], + test_web_page_content(CFG_SITE_URL +'/search?p=find+a+ellis%2C+j+and+not+a+enqvist&of=id&ap=0', + expected_text='[9, 12, 14, 47]')) + + TEST_SUITE = make_test_suite(WebSearchWebPagesAvailabilityTest, WebSearchTestSearch, WebSearchTestBrowse, WebSearchTestOpenURL, WebSearchTestCollections, WebSearchTestRecord, WebSearchTestLegacyURLs, WebSearchNearestTermsTest, WebSearchBooleanQueryTest, WebSearchAuthorQueryTest, WebSearchSearchEnginePythonAPITest, WebSearchSearchEngineWebAPITest, WebSearchRestrictedCollectionTest, WebSearchRestrictedPicturesTest, WebSearchRSSFeedServiceTest, WebSearchXSSVulnerabilityTest, WebSearchResultsOverview, WebSearchSortResultsTest, WebSearchSearchResultsXML, WebSearchUnicodeQueryTest, WebSearchMARCQueryTest, WebSearchExtSysnoQueryTest, WebSearchResultsRecordGroupingTest, WebSearchSpecialTermsQueryTest, WebSearchJournalQueryTest, WebSearchStemmedIndexQueryTest, WebSearchSummarizerTest, WebSearchRecordCollectionGuessTest, WebSearchGetFieldValuesTest, WebSearchAddToBasketTest, WebSearchAlertTeaserTest, WebSearchSpanQueryTest, - WebSearchReferstoCitedbyTest) + WebSearchReferstoCitedbyTest, + WebSearchSPIRESSyntaxTest) if __name__ == "__main__": run_test_suite(TEST_SUITE, warn_user=True)
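A minimal standalone sketch (not part of the patch; the driver function name is ours) of what the new WebSearchSPIRESSyntaxTest exercises. It assumes a running demo site populated with the standard Atlantis demo records, and reuses test_web_page_content() and CFG_SITE_URL from invenio.testutils and invenio.config exactly as the suite above does:

    from invenio.config import CFG_SITE_URL
    from invenio.testutils import test_web_page_content

    def check_spires_and_not():
        # The SPIRES query 'find a ellis, j and not a enqvist' should be
        # converted by the SPIRES->Invenio parser and return exactly the
        # record ids asserted by WebSearchSPIRESSyntaxTest above.
        errors = test_web_page_content(
            CFG_SITE_URL + '/search?p=find+a+ellis%2C+j+and+not+a+enqvist&of=id&ap=0',
            expected_text='[9, 12, 14, 47]')
        # test_web_page_content() returns a list of error messages; an empty
        # list means the page contained the expected text.
        assert errors == [], errors

    if __name__ == '__main__':
        check_spires_and_not()
        print 'OK: SPIRES and-not query behaves as expected'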