## check arguments: if the matching type is phrase/substring/regexp, do we have a field defined?
if (m=='p' or m=='r' or m=='e') and not f:
m = 'a'
print_warning(req, "This matching type cannot be used within <em>any field</em>. I will perform a word search instead." , "Warning")
print_warning(req, "If you want to phrase/substring/regexp search in a specific field, e.g. inside title, then please choose <em>within title</em> search option.", "Tip")
## is desired matching type set?
if m:
## A - matching type is known; good!
if m == 'e':
# A1 - exact value:
opfts.append(['|',p,f,'a']) # '|' since we have only one unit
elif m == 'p':
# A2 - phrase/substring:
opfts.append(['|',"%"+p+"%",f,'a']) # '|' since we have only one unit
elif m == 'r':
# A3 - regular expression:
opfts.append(['|',p,f,'r']) # '|' since we have only one unit
elif m == 'a':
# A4 - all of the words:
for word in get_words_from_phrase(p):
if len(opfts)==0:
opfts.append(['|',word,f,'w']) # '|' in the first unit
else:
opfts.append(['+',word,f,'w']) # '+' in further units
elif m == 'o':
# A5 - any of the words:
for word in get_words_from_phrase(p):
opfts.append(['|',word,f,'w']) # '|' in all units
else:
print_warning(req, "Matching type '%s' is not implemented yet." % m, "Warning")
opfts.append(['|',"%"+p+"%",f,'a'])
else:
## B - matching type is not known: let us try to determine it by some heuristics
if f and p[0]=='"' and p[-1]=='"':
## B0 - does 'p' start and end with a double quote, and is 'f' defined? => doing ACC search
opfts.append(['|',p[1:-1],f,'a'])
elif f and p[0]=="'" and p[-1]=="'":
## B0bis - does 'p' start and end with a single quote, and is 'f' defined? => doing ACC search
opfts.append(['|','%'+p[1:-1]+'%',f,'a'])
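# Illustrative example of the two quoting conventions above (field name is
# hypothetical): with f='title', p='"standard model"' produces the exact
# phrase unit ['|', 'standard model', 'title', 'a'], while the single-quoted
# p="'standard model'" produces the substring unit
# ['|', '%standard model%', 'title', 'a'].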
elif f and string.find(p, ',') >= 0:
## B1 - does 'p' contain comma, and is 'f' defined? => doing ACC search
opfts.append(['|',p,f,'a'])
elif f and str(f[0:2]).isdigit():
## B2 - does 'f' exist and start with two digits? => doing ACC search
opfts.append(['|',p,f,'a'])
else:
## B3 - doing WRD search, but maybe ACC too
# search units are separated by spaces unless the space is within single or double quotes
# so, let us temporarily replace any space within quotes by '__SPACE__' (single quotes) or '__SPACEBIS__' (double quotes)
p = re.sub("'(.*?)'", lambda x: "'"+string.replace(x.group(1), ' ', '__SPACE__')+"'", p)
p = re.sub("\"(.*?)\"", lambda x: "\""+string.replace(x.group(1), ' ', '__SPACEBIS__')+"\"", p)
for pi in split(p, ' '): # iterate through separated units (or items, as "pi" stands for "p item")
pi = re.sub("__SPACE__", " ", pi) # replace back '__SPACE__' by ' '
pi = re.sub("__SPACEBIS__", " ", pi) # replace back '__SPACEBIS__' by ' '
# firstly, determine the set operator
if pi[0] == '+' or pi[0] == '-' or pi[0] == '|':
if len(opfts) or pi[0] == '-': # either not first unit, or '-' for the first unit
oi = pi[0]
else:
oi = "|" # we are in the first unit and operand is not '-', so let us do
# set union (with still null result set)
pi = pi[1:]
else:
# okay, there is no operator, so let us decide what to do by default
if len(opfts):
oi = '+' # by default we are doing set intersection...
else:
oi = "|" # ...unless we are in the first unit
# secondly, determine search pattern and field:
if string.find(pi, ":") > 0:
fi, pi = split(pi, ":", 1)
else:
fi, pi = f, pi
# look also for old ALEPH field names:
if fi and cfg_fields_convert.has_key(string.lower(fi)):
fi = cfg_fields_convert[string.lower(fi)]
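# Illustrative example of the prefix handling above: a unit such as
# 'ti:ellis' is split into fi='ti' and pi='ellis'; if 'ti' is listed in
# cfg_fields_convert as an old ALEPH name, it is then replaced by the
# configured new field name (e.g. 'title', assuming such a mapping exists).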
# wash 'pi' argument:
if re_quotes.match(pi):
# B3a - quotes are found => do ACC search (phrase search)
if fi:
if re_doublequote.match(pi):
pi = string.replace(pi, '"', '') # get rid of quotes
opfts.append([oi,pi,fi,'a'])
else:
pi = string.replace(pi, "'", '') # get rid of quotes
opfts.append([oi,"%"+pi+"%",fi,'a'])
else:
# fi is not defined, so check whether we are doing an exact or a subphrase search (double vs single quotes):
if pi[0]=='"' and pi[-1]=='"':
opfts.append([oi,pi[1:-1],"anyfield",'a'])
print_warning(req, "Searching for an exact match inside any field may be slow. You may want to try to search for words instead, or choose a search within specific field.", "Warning")
else:
# nope, subphrase in global index is not possible => change back to WRD search
for pii in get_words_from_phrase(pi):
# since there may be '-' and other chars that we do not index in WRD
opfts.append([oi,pii,fi,'w'])
print_warning(req, "The sub-phrase search does not work in any field. I'll do a 'logical AND' style of search instead.", "Warning")
print_warning(req, "If you want to do a sub-phrase search in a specific field, e.g. inside title, then please choose 'within title' search option.", "Tip")
print_warning(req, "If you want to do exact phrase matching, then please use double quotes.", "Tip")
elif fi and str(fi[0:2]).isdigit():
# B3b - fi exists and starts with two digits => do ACC search
opfts.append([oi,pi,fi,'a'])
elif fi and not get_wordsindex_id(fi):
# B3c - fi exists but there is no words table for fi => try ACC search
# speed up HitList operations by ~20% if Psyco is installed:
try:
import psyco
psyco.bind(HitList)
except:
pass
def escape_string(s):
"Escapes special chars in string. For MySQL queries."
s = MySQLdb.escape_string(s)
return s
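# Illustrative usage (relying on MySQLdb's standard backslash escaping):
# escape_string("O'Hara") returns O\'Hara, which can safely be embedded
# inside a single-quoted SQL string literal.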
def asciify_accented_letters(s):
"Translates ISO-8859-1 accented letters into their ASCII equivalents."
s = string.translate(s, table_latin1_to_ascii)
return s
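# Illustrative usage (assuming table_latin1_to_ascii, defined elsewhere in
# this module, maps accented Latin-1 letters to their plain counterparts):
# asciify_accented_letters("J\xe9r\xf4me") would return "Jerome".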
def wash_colls(cc, c, split_colls=0):
"""Wash collection list by checking whether user has deselected
anything under 'Narrow search'. Checks also if cc is a list or not.
Return list of cc, colls_to_display, colls_to_search since the list
of collections to display is different from that to search in.
This is because users might have chosen 'split by collection'
functionality.
The behaviour of "collections to display" depends solely whether
user has deselected a particular collection: e.g. if it started
from 'Articles and Preprints' page, and deselected 'Preprints',
then collection to display is 'Articles'. If he did not deselect
anything, then collection to display is 'Articles & Preprints'.
The behaviour of "collections to search in" depends on the
'split_colls' parameter:
* if it is equal to 0, then we can wash the colls list down
and search solely in the collection the user started from;
* if it is equal to 1, then we are splitting to the first level
of collections, i.e. collections as they appear on the page
we started to search from;
* if it is equal to 9, then we are splitting right to the
bottom level.
"""
colls_out = []
colls_out_for_display = []
# check what type 'cc' is:
if type(cc) is list:
for ci in cc:
if collrecs_cache.has_key(ci):
# yes this collection is real, so use it:
cc = ci
break
else:
# check once if cc is real:
if not collrecs_cache.has_key(cc):
cc = cdsname # cc is not real, so replace it with Home collection
# check type of 'c' argument:
if type(c) is list:
colls = c
else:
colls = [c]
# remove all 'unreal' collections:
colls_real = []
for coll in colls:
if collrecs_cache.has_key(coll):
colls_real.append(coll)
colls = colls_real
# check if some real collections remain:
if len(colls)==0:
colls = [cc]
# then let us check the number of sons of 'cc':
query = "SELECT COUNT(cc.id_son) FROM collection_collection AS cc, collection AS c " \
"WHERE c.name='%s' AND c.id=cc.id_dad" % cc
res = run_sql(query, None, 1)
if res and res[0][0] == len(colls):
colls_out_for_display = [cc] # yep, washing permitted, it is sufficient to display 'cc'
else:
colls_out_for_display = colls # nope, we need to display all 'colls' successively
# remove duplicates:
colls_out_for_display_nondups = filter(
    lambda x, l=colls_out_for_display: l[x-1] not in l[x:],
    range(1, len(colls_out_for_display)+1))
# construct 'tl' which defines the tag list (MARC tags) to search in:
tl = []
if str(f[0]).isdigit() and str(f[1]).isdigit():
tl.append(f) # 'f' seems to be okay as it starts with two digits
else:
# convert old ALEPH tag names, if appropriate: (TODO: get rid of this before entering this function)
if cfg_fields_convert.has_key(string.lower(f)):
f = cfg_fields_convert[string.lower(f)]
# deduce desired MARC tags on the basis of chosen 'f'
tl = get_field_tags(f)
if not tl:
# by default we are searching in author index:
tl = get_field_tags("author")
print_warning(req, "The phrase or access file search does not work in this field. Choosing author index instead.", "Warning")
# okay, start search:
l = [] # will hold list of recID that matched
for t in tl:
# deduce into which bibxxx table we will search:
digit1, digit2 = int(t[0]), int(t[1])
bx = "bib%d%dx" % (digit1, digit2)
bibx = "bibrec_bib%d%dx" % (digit1, digit2)
# construct query:
if len(t) != 6 or t[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character:
query = "SELECT bibx.id_bibrec FROM %s AS bx LEFT JOIN %s AS bibx ON bx.id=bibx.id_bibxxx WHERE bx.value %s AND bx.tag LIKE '%s%%'" %\
(bx, bibx, pattern, t)
else:
query = "SELECT bibx.id_bibrec FROM %s AS bx LEFT JOIN %s AS bibx ON bx.id=bibx.id_bibxxx WHERE bx.value %s AND bx.tag='%s'" %\
(bx, bibx, pattern, t)
if dbg:
print_warning(req, query, "Debug")
# launch the query:
res = run_sql(query)
# fill the result set:
for id_bibrec in res:
if id_bibrec[0]:
l.append(id_bibrec[0])
# check no of hits found:
nb_hits = len(l)
if dbg:
print_warning(req, "The pattern '%s' in field '%s' has got '%d' hits." % (p, f, nb_hits), "Info")
# check whether it is sound to do a new search:
if nb_hits == 0 and not (p.startswith("%") and p.endswith("%")):
# try to launch substring search:
p_new = "%" + p + "%"
print_warning(req, "Your original search for <strong>%s</strong> within <strong>%s</strong> field did not return any result. Looking for subphrase/substring match..." % \
"""Sort records in 'recIDs' list according sort field 'sort_field' in order 'sort_order'.
If more than one instance of 'sort_field' is found for a given record, try to choose that that is given by
'sort pattern', for example "sort by report number that starts by CERN-PS".
Note that 'sort_field' can be field code like 'author' or MARC tag like '100__a' directly."""
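# Illustrative example of the sort pattern behaviour described above: when
# sorting by report number with sort_pattern='CERN-PS', a record that
# carries both 'CERN-PS-99-037' and 'hep-ex/9912010' is sorted according
# to 'CERN-PS-99-037'.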
## check arguments:
if not sort_field:
return recIDs
if len(recIDs) > cfg_nb_records_to_sort:
print_warning(req, "Sorry, sorting is allowed on sets of up to %d records only. Using default sort order (\"latest first\")." % cfg_nb_records_to_sort,"Warning")
return recIDs
recIDs_dict = {}
recIDs_out = []
## first deduce sorting MARC tag out of the 'sort_field' argument:
tags = []
if sort_field and str(sort_field[0:2]).isdigit():
# sort_field starts with two digits, so this is probably a MARC tag already
tags.append(sort_field)
else:
# let us check the 'field' table
query = """SELECT DISTINCT(t.value) FROM tag AS t, field_tag AS ft, field AS f
WHERE f.code='%s' AND ft.id_field=f.id AND t.id=ft.id_tag
ORDER BY ft.score DESC""" % sort_field
res = run_sql(query)
if res:
for row in res:
tags.append(row[0])
else:
print_warning(req, "Sorry, '%s' does not seem to be a valid sort option. Choosing title sort instead." % sort_field, "Error")
tags.append("245__a")
## check if we have sorting tag defined:
if tags:
# fetch the necessary field values:
for recID in recIDs:
val = "" # will hold value for recID according to which sort
vals = [] # will hold all values found in sorting tag for recID
for tag in tags:
vals.extend(get_fieldvalues(recID, tag))
if sort_pattern:
# try to pick the tag value that matches the sort pattern
bingo = 0
for v in vals:
if v.startswith(sort_pattern): # bingo!
bingo = 1
val = v
break
if not bingo: # not found, so join them all together
val = string.join(vals)
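# Illustrative example of the value selection above: with
# vals = ['CERN-TH-2002-069', 'hep-ph/0204133'] and sort_pattern='CERN',
# val becomes 'CERN-TH-2002-069'; if no value matches the pattern, all of
# them are joined together and used as the sorting key instead.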
else:
# no sort pattern defined, so join them all together