"""Splits search pattern and search field into a list of independently searchable units.
- A search unit consists of '(operator, pattern, field, type, hitset)' tuples where
'operator' is set union (|), set intersection (+) or set exclusion (-);
'pattern' is either a word (e.g. muon*) or a phrase (e.g. 'nuclear physics');
'field' is either a code like 'title' or MARC tag like '100__a';
'type' is the search type ('w' for word file search, 'a' for access file search).
- Optionally, the function accepts the match type argument 'm'.
If it is set (e.g. from advanced search interface), then it
performs this kind of matching. If it is not set, then a guess is made.
'm' can have values: 'a'='all of the words', 'o'='any of the words',
'p'='phrase/substring', 'r'='regular expression',
'e'='exact value'.
- Warnings are printed on req (when not None) in case of HTML output formats."""
opfts = [] # will hold (o,p,f,t,h) units
## check arguments: if matching type phrase/string/regexp, do we have field defined?
if (m=='p' or m=='r' or m=='e') and not f:
m = 'a'
if of.startswith("h"):
print_warning(req, "This matching type cannot be used within <em>any field</em>. I will perform a word search instead." )
print_warning(req, "If you want to phrase/substring/regexp search in a specific field, e.g. inside title, then please choose <em>within title</em> search option.")
## is desired matching type set?
if m:
## A - matching type is known; good!
if m == 'e':
# A1 - exact value:
opfts.append(['|',p,f,'a']) # '|' since we have only one unit
elif m == 'p':
# A2 - phrase/substring:
opfts.append(['|',"%"+p+"%",f,'a']) # '|' since we have only one unit
elif m == 'r':
# A3 - regular expression:
opfts.append(['|',p,f,'r']) # '|' since we have only one unit
elif m == 'a' or m == 'w':
# A4 - all of the words:
p = strip_accents(p) # strip accents for 'w' mode, FIXME: delete when not needed
for word in get_words_from_pattern(p):
if len(opfts)==0:
opfts.append(['|',word,f,'w']) # '|' in the first unit
else:
opfts.append(['+',word,f,'w']) # '+' in further units
elif m == 'o':
# A5 - any of the words:
p = strip_accents(p) # strip accents for 'w' mode, FIXME: delete when not needed
for word in get_words_from_pattern(p):
opfts.append(['|',word,f,'w']) # '|' in all units
else:
if of.startswith("h"):
print_warning(req, "Matching type '%s' is not implemented yet." % m, "Warning")
opfts.append(['|',"%"+p+"%",f,'a'])
else:
## B - matching type is not known: let us try to determine it by some heuristics
if f and p[0]=='"' and p[-1]=='"':
## B0 - does 'p' start and end by double quote, and is 'f' defined? => doing ACC search
opfts.append(['|',p[1:-1],f,'a'])
elif f and p[0]=="'" and p[-1]=="'":
## B0bis - does 'p' start and end by single quote, and is 'f' defined? => doing ACC search
opfts.append(['|','%'+p[1:-1]+'%',f,'a'])
elif f and p[0]=="/" and p[-1]=="/":
## B0ter - does 'p' start and end by a slash, and is 'f' defined? => doing regexp search
opfts.append(['|',p[1:-1],f,'r'])
elif f and string.find(p, ',') >= 0:
## B1 - does 'p' contain comma, and is 'f' defined? => doing ACC search
opfts.append(['|',p,f,'a'])
elif f and str(f[0:2]).isdigit():
## B2 - does 'f' exist and starts by two digits? => doing ACC search
opfts.append(['|',p,f,'a'])
else:
## B3 - doing WRD search, but maybe ACC too
# search units are separated by spaces unless the space is within single or double quotes
# so, let us replace temporarily any space within quotes by '__SPACE__'
p = sre_pattern_single_quotes.sub(lambda x: "'"+string.replace(x.group(1), ' ', '__SPACE__')+"'", p)
p = sre_pattern_double_quotes.sub(lambda x: "\""+string.replace(x.group(1), ' ', '__SPACE__')+"\"", p)
p = sre_pattern_regexp_quotes.sub(lambda x: "/"+string.replace(x.group(1), ' ', '__SPACE__')+"/", p)
# wash argument:
p = sre_equal.sub(":", p)
p = sre_logical_and.sub(" ", p)
p = sre_logical_or.sub(" |", p)
p = sre_logical_not.sub(" -", p)
p = sre_operators.sub(r' \1', p)
for pi in split(p): # iterate through separated units (or items, as "pi" stands for "p item")
pi = sre_pattern_space.sub(" ", pi) # replace back '__SPACE__' by ' '
# firstly, determine set operator
if pi[0] == '+' or pi[0] == '-' or pi[0] == '|':
if len(opfts) or pi[0] == '-': # either not first unit, or '-' for the first unit
oi = pi[0]
else:
oi = "|" # we are in the first unit and operator is not '-', so let us do
# set union (with still null result set)
pi = pi[1:]
else:
# okay, there is no operator, so let us decide what to do by default
if len(opfts):
oi = '+' # by default we are doing set intersection...
else:
oi = "|" # ...unless we are in the first unit
# secondly, determine search pattern and field:
if string.find(pi, ":") > 0:
fi, pi = split(pi, ":", 1)
else:
fi, pi = f, pi
# look also for old ALEPH field names:
if fi and cfg_fields_convert.has_key(string.lower(fi)):
fi = cfg_fields_convert[string.lower(fi)]
# wash 'pi' argument:
if sre_quotes.match(pi):
# B3a - quotes are found => do ACC search (phrase search)
if fi:
if pi[0] == '"' and pi[-1] == '"':
pi = string.replace(pi, '"', '') # remove quote signs
opfts.append([oi,pi,fi,'a'])
elif pi[0] == "'" and pi[-1] == "'":
pi = string.replace(pi, "'", "") # remove quote signs
opfts.append([oi,"%"+pi+"%",fi,'a'])
else: # unbalanced quotes, so do WRD query:
opfts.append([oi,pi,fi,'w'])
else:
# fi is not defined, look at where we are doing exact or subphrase search (single/double quotes):
if pi[0]=='"' and pi[-1]=='"':
opfts.append([oi,pi[1:-1],"anyfield",'a'])
if of.startswith("h"):
print_warning(req, "Searching for an exact match inside any field may be slow. You may want to search for words instead, or choose to search within specific field.")
else:
# nope, subphrase in global index is not possible => change back to WRD search
pi = strip_accents(pi) # strip accents for 'w' mode, FIXME: delete when not needed
for pii in get_words_from_pattern(pi):
# since there may be '-' and other chars that we do not index in WRD
opfts.append([oi,pii,fi,'w'])
if of.startswith("h"):
print_warning(req, "The partial phrase search does not work in any field. I'll do a boolean AND searching instead.")
print_warning(req, "If you want to do a partial phrase search in a specific field, e.g. inside title, then please choose 'within title' search option.", "Tip")
print_warning(req, "If you want to do exact phrase matching, then please use double quotes.", "Tip")
elif fi and str(fi[0]).isdigit() and str(fi[0]).isdigit():
# B3b - fi exists and starts by two digits => do ACC search
opfts.append([oi,pi,fi,'a'])
elif fi and not get_index_id(fi):
# B3c - fi exists but there is no words table for fi => try ACC search
opfts.append([oi,pi,fi,'a'])
elif fi and pi.startswith('/') and pi.endswith('/'):
# B3d - fi exists and slashes found => try regexp search
opfts.append([oi,pi[1:-1],fi,'r'])
else:
# B3e - general case => do WRD search
pi = strip_accents(pi) # strip accents for 'w' mode, FIXME: delete when not needed
# Speed up HitSet operations by ~20% if Psyco is installed.
# This is strictly best-effort: Psyco is an optional accelerator, so any
# failure to import or to bind simply leaves HitSet running unaccelerated.
try:
    import psyco
    psyco.bind(HitSet)
except Exception:
    # Catch Exception rather than a bare 'except:' so that
    # KeyboardInterrupt/SystemExit still propagate; a missing or broken
    # Psyco (or bind failure) just disables the optional speed-up.
    pass
def escape_string(s):
    """Return *s* with characters special to MySQL escaped.

    Thin convenience wrapper around MySQLdb.escape_string, kept so that
    query-building call sites stay short.
    """
    return MySQLdb.escape_string(s)
def wash_colls(cc, c, split_colls=0):
    """Wash collection list by checking whether user has deselected
    anything under 'Narrow search'. Checks also if cc is a list or not.
    Return list of cc, colls_to_display, colls_to_search since the list
    of collections to display is different from that to search in.
    This is because users might have chosen 'split by collection'
    functionality.
    The behaviour of "collections to display" depends solely whether
    user has deselected a particular collection: e.g. if it started
    from 'Articles and Preprints' page, and deselected 'Preprints',
    then collection to display is 'Articles'. If he did not deselect
    anything, then collection to display is 'Articles & Preprints'.
    The behaviour of "collections to search in" depends on the
    'split_colls' parameter:
         * if is equal to 1, then we can wash the colls list down
           and search solely in the collection the user started from;
         * if is equal to 0, then we are splitting to the first level
           of collections, i.e. collections as they appear on the page
           we started to search from;
    """
    colls_out = []               # will hold the washed "collections to search in"
    colls_out_for_display = []   # will hold the washed "collections to display"
    # check what type is 'cc':
    if type(cc) is list:
        # 'cc' arrived as a list: keep the first entry that is a known
        # ("real") collection according to the reclist cache
        for ci in cc:
            if collection_reclist_cache.has_key(ci):
                # yes this collection is real, so use it:
                cc = ci
                break
    else:
        # check once if cc is real:
        if not collection_reclist_cache.has_key(cc):
            cc = cdsname # cc is not real, so replace it with Home collection
    # check type of 'c' argument; normalize to a list of collection names:
    if type(c) is list:
        colls = c
    else:
        colls = [c]
    # remove all 'unreal' collections (names absent from the reclist cache):
    colls_real = []
    for coll in colls:
        if collection_reclist_cache.has_key(coll):
            colls_real.append(coll)
    colls = colls_real
    # check if some real collections remain; if not, fall back to 'cc' itself:
    if len(colls)==0:
        colls = [cc]
    # then let us check the list of non-restricted "real" sons of 'cc' and compare it to 'coll':
    # (sons = direct children of type 'r' that carry no 'restricted' flag)
    query = "SELECT c.name FROM collection AS c, collection_collection AS cc, collection AS ccc WHERE c.id=cc.id_son AND cc.id_dad=ccc.id AND ccc.name='%s' AND cc.type='r' AND c.restricted IS NULL" % MySQLdb.escape_string(cc)
    res = run_sql(query)
    l_cc_nonrestricted_sons = []
    # NOTE(review): 'l_c' aliases 'colls', so the .sort() below mutates
    # 'colls' in place too — looks intentional (order no longer matters
    # after this point in the visible code), but confirm.
    l_c = colls
    for row in res:
        l_cc_nonrestricted_sons.append(row[0])
    # compare the two name lists order-insensitively:
    l_c.sort()
    l_cc_nonrestricted_sons.sort()
    if l_cc_nonrestricted_sons == l_c:
        colls_out_for_display = [cc] # yep, washing permitted, it is sufficient to display 'cc'
    else:
        colls_out_for_display = colls # nope, we need to display all 'colls' successively
    # remove duplicates: keeps the 1-based indices of entries whose value
    # does not reappear later in the list (last occurrence wins)
    colls_out_for_display_nondups=filter(lambda x, colls_out_for_display=colls_out_for_display: colls_out_for_display[x-1] not in colls_out_for_display[x:], range(1, len(colls_out_for_display)+1))
"""Sort records in 'recIDs' list according sort field 'sort_field' in order 'sort_order'.
If more than one instance of 'sort_field' is found for a given record, try to choose that that is given by
'sort pattern', for example "sort by report number that starts by CERN-PS".
Note that 'sort_field' can be field code like 'author' or MARC tag like '100__a' directly."""
## check arguments:
if not sort_field:
return recIDs
if len(recIDs) > cfg_nb_records_to_sort:
print_warning(req, "Sorry, sorting is allowed on sets of up to %d records only. Using default sort order (\"latest first\")." % cfg_nb_records_to_sort,"Warning")
return recIDs
sort_fields = string.split(sort_field, ",")
recIDs_dict = {}
recIDs_out = []
## first deduce sorting MARC tag out of the 'sort_field' argument:
tags = []
for sort_field in sort_fields:
if sort_field and str(sort_field[0:2]).isdigit():
# sort_field starts by two digits, so this is probably a MARC tag already
tags.append(sort_field)
else:
# let us check the 'field' table
query = """SELECT DISTINCT(t.value) FROM tag AS t, field_tag AS ft, field AS f
WHERE f.code='%s' AND ft.id_field=f.id AND t.id=ft.id_tag
ORDER BY ft.score DESC""" % sort_field
res = run_sql(query)
if res:
for row in res:
tags.append(row[0])
else:
print_warning(req, "Sorry, '%s' does not seem to be a valid sort option. Choosing title sort instead." % sort_field, "Error")
tags.append("245__a")
if verbose >= 3:
print_warning(req, "Sorting by tags %s." % tags)
if sort_pattern:
print_warning(req, "Sorting preferentially by %s." % sort_pattern)
## check if we have sorting tag defined:
if tags:
# fetch the necessary field values:
for recID in recIDs:
val = "" # will hold value for recID according to which sort
vals = [] # will hold all values found in sorting tag for recID
for tag in tags:
vals.extend(get_fieldvalues(recID, tag))
if sort_pattern:
# try to pick that tag value that corresponds to sort pattern
bingo = 0
for v in vals:
if v.startswith(sort_pattern): # bingo!
bingo = 1
val = v
break
if not bingo: # sort_pattern not present, so add other vals after spaces
val = sort_pattern + " " + string.join(vals)
else:
# no sort pattern defined, so join them all together
res = run_sql("SHOW TABLE STATUS LIKE 'collection'")
out += "- collection table last updated: %s" % str(res[0][11])
out += "<br>- reclist cache timestamp: %s" % collection_reclist_cache_timestamp
out += "<br>- reclist cache contents:"
out += "<blockquote>"
for coll in collection_reclist_cache.keys():
if collection_reclist_cache[coll]:
out += "%s (%d)<br>" % (coll, get_collection_reclist(coll)._nbhits)
out += "</blockquote>"
# show search cache:
out += "<h3>Search Cache</h3>"
out += "<blockquote>"
if len(search_cache):
out += """<table border="=">"""
out += "<tr><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td></tr>" % ("Pattern","Field","Collection","Number of Hits")
for search_cache_key in search_cache.keys():
p, f, c = string.split(search_cache_key, "@", 2)
# find out about length of cached data:
l = 0
for coll in search_cache[search_cache_key]:
l += search_cache[search_cache_key][coll]._nbhits
out += "<tr><td>%s</td><td>%s</td><td>%s</td><td>%d</td></tr>" % (p, f, c, l)
out += "</table>"
else:
out += "<p>Search cache is empty."
out += "</blockquote>"
out += """<p><a href="%s/search.py/cache?action=clear">clear cache</a>""" % weburl
# show field i18nname cache:
out += "<h3>Field I18N names cache</h3>"
res = run_sql("SHOW TABLE STATUS LIKE 'fieldname'")
out += "- fieldname table last updated: %s" % str(res[0][11])
out += "<br>- i18nname cache timestamp: %s" % field_i18nname_cache_timestamp
out += "<br>- i18nname cache contents:"
out += "<blockquote>"
for field in field_i18nname_cache.keys():
for ln in field_i18nname_cache[field].keys():
out += "%s, %s = %s<br>" % (field, ln, field_i18nname_cache[field][ln])
out += "</blockquote>"
# show collection i18nname cache:
out += "<h3>Collection I18N names cache</h3>"
res = run_sql("SHOW TABLE STATUS LIKE 'collectionname'")
out += "- collectionname table last updated: %s" % str(res[0][11])
out += "<br>- i18nname cache timestamp: %s" % collection_i18nname_cache_timestamp
out += "<br>- i18nname cache contents:"
out += "<blockquote>"
for coll in collection_i18nname_cache.keys():
for ln in collection_i18nname_cache[coll].keys():
out += "%s, %s = %s<br>" % (coll, ln, collection_i18nname_cache[coll][ln])
out += "</blockquote>"
req.write(out)
return "\n"
def perform_request_log(req, date=""):
"""Display search log information for given date."""