"""Returns first index id where the field code FIELD is indexed.
Returns zero in case there is no table for this index.
Example: field='author', output=4."""
out=0
query="""SELECT w.id FROM idxINDEX AS w, idxINDEX_field AS wf, field AS f
WHERE f.code='%s' AND wf.id_field=f.id AND w.id=wf.id_idxINDEX
LIMIT 1"""%MySQLdb.escape_string(field)
res=run_sql(query,None,1)
ifres:
out=res[0][0]
returnout
def get_words_from_pattern(pattern):
    """Return the list of distinct whitespace-separated words from 'pattern'.

    Each word appears once, in the order of its first occurrence in
    'pattern'.  An empty or all-whitespace pattern yields an empty list.
    """
    seen = {}   # words already emitted (dict used as a set)
    words = []  # distinct words, in first-occurrence order
    for word in pattern.split():
        if word not in seen:
            seen[word] = 1
            words.append(word)
    return words
defcreate_basic_search_units(req,p,f,m=None):
"""Splits search pattern 'p' and search field 'f' into a list of independently searchable units.
- Every unit appended in the code below is a four-element list
'[operator, pattern, field, type]' where
'operator' is set union (|), set intersection (+) or set exclusion (-);
'pattern' is either a word (e.g. muon*) or a phrase (e.g. 'nuclear physics');
'field' is either a code like 'title' or MARC tag like '100__a';
'type' is the search type ('w' for word file search, 'a' for access file search,
'r' for regular expression search).
(The historical description also lists a fifth 'hitset' element; it is not
filled in by the code visible here.)
- 'req' is only handed over to print_warning() when a message has to be shown.
- Optionally, the function accepts the match type argument 'm'.
If it is set (e.g. from advanced search interface), then it
performs this kind of matching. If it is not set, then a guess is made
from the shape of 'p' and 'f' (surrounding quotes or slashes, a comma,
a field starting with two digits).
'm' can have values: 'a'='all of the words', 'o'='any of the words',
'p'='phrase/substring', 'r'='regular expression',
'e'='exact value'. The value 'w' is treated exactly like 'a'; any other
value prints a warning and falls back to a '%p%' access file search.
- If 'm' is 'p', 'r' or 'e' but no field 'f' is given, 'm' is reset to 'a'
and two warnings are printed."""
opfts=[]# will hold (o,p,f,t,h) units
## check arguments: if matching type phrase/string/regexp, do we have field defined?
if(m=='p'orm=='r'orm=='e')andnotf:
m='a'
print_warning(req,"This matching type cannot be used within <em>any field</em>. I will perform a word search instead.")
print_warning(req,"If you want to phrase/substring/regexp search in a specific field, e.g. inside title, then please choose <em>within title</em> search option.")
## is desired matching type set?
ifm:
## A - matching type is known; good!
ifm=='e':
# A1 - exact value:
opfts.append(['|',p,f,'a'])# '|' since we have only one unit
elifm=='p':
# A2 - phrase/substring: pattern is wrapped in '%' wildcards
opfts.append(['|',"%"+p+"%",f,'a'])# '|' since we have only one unit
elifm=='r':
# A3 - regular expression:
opfts.append(['|',p,f,'r'])# '|' since we have only one unit
elifm=='a'orm=='w':
# A4 - all of the words: one unit per distinct word, intersected together
p=strip_accents(p)# strip accents for 'w' mode, FIXME: delete when not needed
forwordinget_words_from_pattern(p):
iflen(opfts)==0:
opfts.append(['|',word,f,'w'])# '|' in the first unit
else:
opfts.append(['+',word,f,'w'])# '+' in further units
elifm=='o':
# A5 - any of the words: one unit per distinct word, united together
p=strip_accents(p)# strip accents for 'w' mode, FIXME: delete when not needed
forwordinget_words_from_pattern(p):
opfts.append(['|',word,f,'w'])# '|' in all units
else:
# unknown matching type: warn and fall back to a substring access file search
print_warning(req,"Matching type '%s' is not implemented yet."%m,"Warning")
opfts.append(['|',"%"+p+"%",f,'a'])
else:
## B - matching type is not known: let us try to determine it by some heuristics
# NOTE(review): when 'f' is set and 'p' is an empty string, p[0] below raises IndexError.
iffandp[0]=='"'andp[-1]=='"':
## B0 - does 'p' start and end by double quote, and is 'f' defined? => doing ACC search
opfts.append(['|',p[1:-1],f,'a'])
eliffandp[0]=="'"andp[-1]=="'":
## B0bis - does 'p' start and end by single quote, and is 'f' defined? => doing ACC search
opfts.append(['|','%'+p[1:-1]+'%',f,'a'])
eliffandp[0]=="/"andp[-1]=="/":
## B0ter - does 'p' start and end by a slash, and is 'f' defined? => doing regexp search
opfts.append(['|',p[1:-1],f,'r'])
eliffandstring.find(p,',')>=0:
## B1 - does 'p' contain comma, and is 'f' defined? => doing ACC search
opfts.append(['|',p,f,'a'])
eliffandstr(f[0:2]).isdigit():
## B2 - does 'f' exist and starts by two digits? => doing ACC search
opfts.append(['|',p,f,'a'])
else:
## B3 - doing WRD search, but maybe ACC too
# search units are separated by spaces unless the space is within single or double quotes
# so, let us replace temporarily any space within quotes by '__SPACE__'
# NOTE(review): 'oi', 'pi' and 'fi' used below are not assigned anywhere in the
# visible code; the loop that splits 'p' into units appears to be missing here --
# confirm against the full file.
# B3a - quotes are found => do ACC search (phrase search)
iffi:
ifsre_doublequote.match(pi):
pi=string.replace(pi,'"','')# get rid of quotes
opfts.append([oi,pi,fi,'a'])
else:
pi=string.replace(pi,"'",'')# get rid of quotes
opfts.append([oi,"%"+pi+"%",fi,'a'])
else:
# fi is not defined, look at where we are doing exact or subphrase search (single/double quotes):
ifpi[0]=='"'andpi[-1]=='"':
opfts.append([oi,pi[1:-1],"anyfield",'a'])
print_warning(req,"Searching for an exact match inside any field may be slow. You may want to search for words instead, or choose to search within specific field.")
else:
# nope, subphrase in global index is not possible => change back to WRD search
pi=strip_accents(pi)# strip accents for 'w' mode, FIXME: delete when not needed
forpiiinget_words_from_pattern(pi):
# since there may be '-' and other chars that we do not index in WRD
opfts.append([oi,pii,fi,'w'])
print_warning(req,"The partial phrase search does not work in any field. I'll do a boolean AND searching instead.")
print_warning(req,"If you want to do a partial phrase search in a specific field, e.g. inside title, then please choose 'within title' search option.","Tip")
print_warning(req,"If you want to do exact phrase matching, then please use double quotes.","Tip")
# NOTE(review): no 'return opfts' is visible at the end of this function -- confirm against the full file.
# Speed up HitSet operations by ~20% if Psyco is installed.  Psyco is an
# optional accelerator, so a failure to import or bind it is deliberately
# ignored.  Only Exception subclasses are caught, so that (on Python 2.5+)
# KeyboardInterrupt and SystemExit are not silently swallowed.
try:
    import psyco
    psyco.bind(HitSet)
except Exception:
    pass
def escape_string(s):
    """Return 's' with special characters escaped, for use in MySQL queries.

    Thin wrapper that delegates to MySQLdb.escape_string().
    """
    return MySQLdb.escape_string(s)
defwash_colls(cc,c,split_colls=0):
"""Wash collection list by checking whether the user has deselected
anything under 'Narrow search'. Also checks whether 'cc' is a list or not.
Intended to return the list of cc, colls_to_display, colls_to_search, since
the list of collections to display is different from that to search in.
This is because users might have chosen the 'split by collection'
functionality.
The behaviour of "collections to display" depends solely on whether the
user has deselected a particular collection: e.g. if the search started
from the 'Articles and Preprints' page, and 'Preprints' was deselected,
then the collection to display is 'Articles'. If nothing was deselected,
then the collection to display is 'Articles & Preprints'.
The behaviour of "collections to search in" depends on the
'split_colls' parameter:
* if it is equal to 1, then we can wash the colls list down
and search solely in the collection the user started from;
* if it is equal to 0, then we are splitting to the first level
of collections, i.e. collections as they appear on the page
we started to search from;
A collection counts as 'real' when its name is a key of
collection_reclist_cache.
NOTE(review): the code visible here only computes the "collections to
display" part; 'split_colls' and 'colls_out' are not used and no return
statement is visible -- confirm against the full file.
"""
colls_out=[]
colls_out_for_display=[]
# check what type is 'cc':
iftype(cc)islist:
# take the first element of the list that is a real collection
# NOTE(review): if no element is real, 'cc' stays a list and is later passed to
# MySQLdb.escape_string() -- confirm whether that case can occur.
forciincc:
ifcollection_reclist_cache.has_key(ci):
# yes this collection is real, so use it:
cc=ci
break
else:
# check once if cc is real:
ifnotcollection_reclist_cache.has_key(cc):
cc=cdsname# cc is not real, so replace it with Home collection
# check type of 'c' argument:
iftype(c)islist:
colls=c
else:
colls=[c]
# remove all 'unreal' collections:
colls_real=[]
forcollincolls:
ifcollection_reclist_cache.has_key(coll):
colls_real.append(coll)
colls=colls_real
# check if some real collections remain:
iflen(colls)==0:
colls=[cc]
# then let us check the list of non-restricted "real" sons of 'cc' and compare it to 'coll':
query="SELECT c.name FROM collection AS c, collection_collection AS cc, collection AS ccc WHERE c.id=cc.id_son AND cc.id_dad=ccc.id AND ccc.name='%s' AND cc.type='r' AND c.restricted IS NULL"%MySQLdb.escape_string(cc)
res=run_sql(query)
l_cc_nonrestricted_sons=[]
# 'l_c' is the same list object as 'colls', so sorting it below also reorders 'colls'
l_c=colls
forrowinres:
l_cc_nonrestricted_sons.append(row[0])
# sort both lists so that the comparison below ignores ordering
l_c.sort()
l_cc_nonrestricted_sons.sort()
ifl_cc_nonrestricted_sons==l_c:
colls_out_for_display=[cc]# yep, washing permitted, it is sufficient to display 'cc'
else:
colls_out_for_display=colls# nope, we need to display all 'colls' successively
"""Sort records in 'recIDs' list according sort field 'sort_field' in order 'sort_order'.
If more than one instance of 'sort_field' is found for a given record, try to choose that that is given by
'sort pattern', for example "sort by report number that starts by CERN-PS".
Note that 'sort_field' can be field code like 'author' or MARC tag like '100__a' directly."""
## check arguments:
ifnotsort_field:
returnrecIDs
iflen(recIDs)>cfg_nb_records_to_sort:
print_warning(req,"Sorry, sorting is allowed on sets of up to %d records only. Using default sort order (\"latest first\")."%cfg_nb_records_to_sort,"Warning")
returnrecIDs
sort_fields=string.split(sort_field,",")
recIDs_dict={}
recIDs_out=[]
## first deduce sorting MARC tag out of the 'sort_field' argument:
tags=[]
forsort_fieldinsort_fields:
ifsort_fieldandstr(sort_field[0:2]).isdigit():
# sort_field starts by two digits, so this is probably a MARC tag already
tags.append(sort_field)
else:
# let us check the 'field' table
query="""SELECT DISTINCT(t.value) FROM tag AS t, field_tag AS ft, field AS f
WHERE f.code='%s' AND ft.id_field=f.id AND t.id=ft.id_tag
ORDER BY ft.score DESC"""%sort_field
res=run_sql(query)
ifres:
forrowinres:
tags.append(row[0])
else:
print_warning(req,"Sorry, '%s' does not seem to be a valid sort option. Choosing title sort instead."%sort_field,"Error")
tags.append("245__a")
ifverbose>=3:
print_warning(req,"Sorting by tags %s."%tags)
ifsort_pattern:
print_warning(req,"Sorting preferentially by %s."%sort_pattern)
## check if we have sorting tag defined:
iftags:
# fetch the necessary field values:
forrecIDinrecIDs:
val=""# will hold value for recID according to which sort
vals=[]# will hold all values found in sorting tag for recID
fortagintags:
vals.extend(get_fieldvalues(recID,tag))
ifsort_pattern:
# try to pick that tag value that corresponds to sort pattern
bingo=0
forvinvals:
ifv.startswith(sort_pattern):# bingo!
bingo=1
val=v
break
ifnotbingo:# sort_pattern not present, so add other vals after spaces
val=sort_pattern+" "+string.join(vals)
else:
# no sort pattern defined, so join them all together
out+="<tr><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td></tr>"%("Pattern","Field","Collection","Number of Hits")
req.write("<tr><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td></tr>"%("No.","Time","Pattern","Field","Collection","Number of Hits"))