diff --git a/modules/bibmatch/lib/bibmatch_engine.py b/modules/bibmatch/lib/bibmatch_engine.py index d4cfa9dcb..cacf178de 100644 --- a/modules/bibmatch/lib/bibmatch_engine.py +++ b/modules/bibmatch/lib/bibmatch_engine.py @@ -1,561 +1,561 @@ ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibMatch tool to match records with database content.""" __revision__ = "$Id$" import fileinput import string import os import sys import getopt from invenio.config import \ CFG_BINDIR, \ CFG_VERSION from invenio.search_engine import perform_request_search from invenio.bibrecord import create_records, record_get_field_instances, \ record_get_field_values, record_xml_output from invenio import bibconvert from invenio.dbquery import run_sql def usage(): """Print help""" print >> sys.stderr, \ """ Usage: %s [options] Examples: $ bibmatch [--print-new] --field=\"title\" < input.xml > output.xml $ bibmatch --print-match --field=\"245__a\" --mode=\"a\" < input.xml > output.xml $ bibmatch --print-ambiguous --query-string=\"245__a||100__a\" < input.xml > output.xml $bibmatch [options] < input.xml > output.xml Options: Output: -0 --print-new (default) -1 --print-match -2 --print-ambiguous -b --batch-output=(filename) Simple query: -f --field=(field) Advanced query: -c --config=(config-filename) -q --query-string=(uploader_querystring) -m --mode=(a|e|o|p|r)[3] -o --operator=(a|o)[2] General options: -h, --help print this help and exit -V, --version print version information and exit -v, --verbose=LEVEL verbose level (from 0 to 9, default 1) """ % sys.argv[0] sys.exit(1) return class Querystring: - "Holds the information about querystring (p1,f1,m1,op1,p2,f2,m2,op2,p3,f3,m3,as)." + "Holds the information about querystring (p1,f1,m1,op1,p2,f2,m2,op2,p3,f3,m3,aas)." def __init__(self, mode="1"): """Creates querystring instance""" self.pattern = [] self.field = [] self.mode = [] self.operator = [] self.format = [] self.pattern.append("") self.pattern.append("") self.pattern.append("") self.field.append("") self.field.append("") self.field.append("") self.mode.append("") self.mode.append("") self.mode.append("") self.operator.append("") self.operator.append("") self.format.append([]) self.format.append([]) self.format.append([]) self.advanced = 0 return def from_qrystr(self, qrystr="", search_mode="eee", operator="aa"): """Converts qrystr into querystring (uploader format)""" self.default() self.field = [] self.format = [] self.mode = ["e","e","e"] fields = string.split(qrystr,"||") for field in fields: tags = string.split(field, "::") i = 0 format = [] for tag in tags: if(i==0): self.field.append(tag) else: format.append(tag) i +=1 self.format.append(format) while(len(self.format) < 3): self.format.append("") while(len(self.field) < 3): self.field.append("") i = 0 for lett in search_mode: self.mode[i] = lett i += 1 i = 0 for lett in operator: self.operator[i] = lett i += 1 return def default(self): self.pattern = [] self.field = [] self.mode = [] self.operator = [] self.format = [] self.pattern.append("") self.pattern.append("") self.pattern.append("") self.field.append("245__a") self.field.append("") self.field.append("") self.mode.append("a") self.mode.append("") self.mode.append("") self.operator.append("") self.operator.append("") self.format.append([]) self.format.append([]) self.format.append([]) self.advanced = 1 return def change_search_mode(self, mode="a"): self.mode = [mode,mode,mode] return def search_engine_encode(self): field_ = [] for field in self.field: i = 0 field__ = "" for letter in field: if(letter == "%"): if(i==5): letter = "a" else: letter = "_" i+=1 field__ += str(letter) field_.append(field__) self.field = field_ return def get_field_tags(field): "Gets list of field 'field' for the record with 'sysno' system number from the database." query = "select tag.value from tag left join field_tag on tag.id=field_tag.id_tag left join field on field_tag.id_field=field.id where field.code='%s'" % field; out = [] res = run_sql(query) for row in res: out.append(row[0]) return out def get_subfield(field, subfield): "Return subfield of a field." for sbf in field: if(sbf[0][0][0] == subfield): return sbf[0][0][1] return "" def matched_records(recID_lists): "Analyze list of matches. Ambiguous record result is always preferred." recID_tmp = [] for recID_list in recID_lists: if(len(recID_list) > 1): return 2 if(len(recID_list) == 1): if(len(recID_tmp) == 0): recID_tmp.append(recID_list[0]) else: if(recID_list[0] in recID_tmp): pass else: return 2 if(len(recID_tmp) == 1): return 1 return 0 def matched_records_min(recID_lists): "Analyze lists of matches. New record result is preferred if result is unmatched." min = 2 for recID_list in recID_lists: if(len(recID_list) < min): min = len(recID_list) if(min==1): return min return min def matched_records_max(recID_lists): "Analyze lists of matches. Ambiguous result is preferred if result is unmatched." max = 0 for recID_list in recID_lists: if(len(recID_list) == 1): return 1 if(len(recID_list) > max): max = len(recID_list) if (max > 1): return 2 elif (max == 1): return 1 else: return 0 return 2 def main(): # Record matches database content when defined search gives exactly one record in the result set. # By default the match is done on the title field. # Using advanced search only 3 fields can be queried concurrently # qrystr - querystring in the UpLoader format try: opts, args = getopt.getopt(sys.argv[1:],"012hVm:f:q:c:nv:o:b:", [ "print-new", "print-match", "print-ambiguous", "help", "version", "mode=", "field=", "query-string=", "config=", "no-process", "verbose=", "operator=", "batch-output=" ]) except getopt.GetoptError, e: usage() recs_out = [] recID_list = [] recID_lists = [] qrystrs = [] match_mode = 0 # default match mode to print new records rec_new = 0 # indicator that record is new rec_match = 0 # indicator that record is matched matched = 0 # number of records matched record_counter = 0 # number of records processed noprocess = 0 result = [0,0,0] perform_request_search_mode = "eee" operator = "aa" verbose = 1 # 0..be quiet level = 1 # 1..exact match file_read = "" records = [] batch_output = "" predefined_fields = ["title", "author"] for opt, opt_value in opts: if opt in ["-0", "--print-new"]: match_mode = 0 if opt in ["-1", "--print-match"]: match_mode = 1 if opt in ["-2", "--print-ambiguous"]: match_mode = 2 if opt in ["-n", "--no-process"]: noprocess = 1 if opt in ["-h", "--help"]: usage() sys.exit(0) if opt in ["-V", "--version"]: print __revision__ sys.exit(0) if opt in ["-v", "--verbose"]: verbose = int(opt_value) if opt in ["-q", "--query-string"]: qrystrs.append(opt_value) if opt in ["-m", "--mode"]: perform_request_search_mode = opt_value if opt in ["-o", "--operator"]: operator = opt_value if opt in ["-b", "--batch-output"]: batch_output = opt_value if opt in ["-f", "--field"]: alternate_querystring = [] if opt_value in predefined_fields: alternate_querystring = get_field_tags(opt_value) for item in alternate_querystring: qrystrs.append(item) else: qrystrs.append(opt_value) if opt in ["-c", "--config"]: config_file = opt_value config_file_read = bibconvert.read_file(config_file, 0) for line in config_file_read: tmp = string.split(line, "---") if(tmp[0] == "QRYSTR"): qrystrs.append(tmp[1]) if verbose: sys.stderr.write("\nBibMatch: Parsing input file ... ") for line_in in sys.stdin: file_read += line_in records = create_records(file_read) if len(records) == 0: if verbose: sys.stderr.write("\nBibMatch: Input file contains no records.\n") sys.exit() else: if verbose: sys.stderr.write("read %d records" % len(records)) sys.stderr.write("\nBibMatch: Matching ...") ### Prepare batch output if (batch_output != ""): out_0 = [] out_1 = [] out_2 = [] for rec in records: ### for each query-string record_counter += 1 if (verbose > 1): sys.stderr.write("\n Processing record: #%d .." % record_counter) recID_lists = [] if(len(qrystrs)==0): qrystrs.append("") more_detailed_info = "" for qrystr in qrystrs: querystring = Querystring() querystring.default() if(qrystr != ""): querystring.from_qrystr(qrystr, perform_request_search_mode, operator) else: querystring.default() ### search engine qrystr encode querystring.search_engine_encode() ### get field values inst = [] ### get appropriate corresponding fields from database i = 0 for field in querystring.field: ### use expanded tags tag = field[0:3] ind1 = field[3:4] ind2 = field[4:5] code = field[5:6] if((ind1 == "_")or(ind1 == "%")): ind1 = "" if((ind2 == "_")or(ind2 == "%")): ind2 = "" if((code == "_")or(code == "%")): code = "a" if(field != "001"): sbf = get_subfield(record_get_field_instances(rec[0], tag, ind1, ind2), code) inst.append(sbf) elif(field in ["001"]): sbf = record_get_field_values(rec[0], field, ind1="", ind2="", code="") inst.append(sbf) else: inst.append("") i += 1 ### format acquired field values i = 0 for instance in inst: for format in querystring.format[i]: inst[i] = bibconvert.FormatField(inst[i],format) i += 1 ### perform sensible request search only if(inst[0]!=""): recID_list = perform_request_search( p1=inst[0], f1=querystring.field[0], m1=querystring.mode[0], op1=querystring.operator[0], p2=inst[1], f2=querystring.field[1], m2=querystring.mode[1], op2=querystring.operator[1], - p3=inst[2], f3=querystring.field[2], m3=querystring.mode[2], as=querystring.advanced) + p3=inst[2], f3=querystring.field[2], m3=querystring.mode[2], aas=querystring.advanced) else: recID_list = [] recID_lists.append(recID_list) ### more detailed info ... if(verbose > 8): more_detailed_info = "%s\n Matched recIDs: %s" % (more_detailed_info, recID_lists) if(verbose > 2): more_detailed_info = "%s\n On query: %s, %s, %s, %s\n %s, %s, %s, %s\n %s, %s, %s\n" % (more_detailed_info, inst[0], querystring.field[0], querystring.mode[0], querystring.operator[0], inst[1], querystring.field[1], querystring.mode[1], querystring.operator[1], inst[2], querystring.field[2], querystring.mode[2]) ### for multitagged fields (e.g. title), unmatched result corresponds to the item in extreme rec_match = matched_records_max(recID_lists) ### print-new if (rec_match==0): result[0] += 1 if(match_mode==0): recs_out.append(rec) if (batch_output != ""): out_0.append(rec) if verbose: sys.stderr.write(".") if (verbose > 1): sys.stderr.write("NEW") ### print-match elif (rec_match <= level): result[1] += 1 if(match_mode==1): recs_out.append(rec) if (batch_output != ""): out_1.append(rec) if verbose: sys.stderr.write(".") if (verbose > 1): sys.stderr.write("MATCH") ### print-ambiguous elif(rec_match > level): result[2] += 1 if(match_mode==2): recs_out.append(rec) if (batch_output != ""): out_2.append(rec) if verbose: sys.stderr.write(".") if (verbose > 1): sys.stderr.write("AMBIGUOUS") else: pass sys.stderr.write(more_detailed_info) if verbose: sys.stderr.write("\n\n Bibmatch report\n") sys.stderr.write("=" * 35) sys.stderr.write("\n New records : %d" % result[0]) sys.stderr.write("\n Matched records : %d" % result[1]) sys.stderr.write("\n Ambiguous records : %d\n" % result[2]) sys.stderr.write("=" * 35) sys.stderr.write("\n Total records : %d\n" % record_counter) if noprocess: pass else: for record in recs_out: print record_xml_output(record[0]) if (batch_output != ""): filename = "%s.0" % batch_output file_0 = open(filename,"w") filename = "%s.1" % batch_output file_1 = open(filename,"w") filename = "%s.2" % batch_output file_2 = open(filename,"w") for record in out_0: file_0.write(record_xml_output(record[0])) for record in out_1: file_1.write(record_xml_output(record[0])) for record in out_2: file_2.write(record_xml_output(record[0])) file_0.close() file_1.close() file_2.close() diff --git a/modules/webalert/lib/alert_engine.py b/modules/webalert/lib/alert_engine.py index c11a4bca7..27b00ca8e 100644 --- a/modules/webalert/lib/alert_engine.py +++ b/modules/webalert/lib/alert_engine.py @@ -1,349 +1,348 @@ # -*- coding: utf-8 -*- ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Alert engine implementation.""" ## rest of the Python code goes below __revision__ = "$Id$" from cgi import parse_qs from re import search, sub from time import strftime import datetime from invenio.config import \ CFG_LOGDIR, \ CFG_SITE_SUPPORT_EMAIL, \ CFG_SITE_URL, \ CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES, \ CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES from invenio.webbasket_dblayer import get_basket_owner_id, add_to_basket from invenio.search_engine import perform_request_search from invenio.webinterface_handler import wash_urlargd from invenio.dbquery import run_sql from invenio.webuser import get_email from invenio.mailutils import send_email from invenio.errorlib import register_exception from invenio.alert_engine_config import CFG_WEBALERT_DEBUG_LEVEL import invenio.template websearch_templates = invenio.template.load('websearch') webalert_templates = invenio.template.load('webalert') def update_date_lastrun(alert): return run_sql('update user_query_basket set date_lastrun=%s where id_user=%s and id_query=%s and id_basket=%s;', (strftime("%Y-%m-%d"), alert[0], alert[1], alert[2],)) def get_alert_queries(frequency): return run_sql('select distinct id, urlargs from query q, user_query_basket uqb where q.id=uqb.id_query and uqb.frequency=%s and uqb.date_lastrun <= now();', (frequency,)) def get_alert_queries_for_user(uid): return run_sql('select distinct id, urlargs, uqb.frequency from query q, user_query_basket uqb where q.id=uqb.id_query and uqb.id_user=%s and uqb.date_lastrun <= now();', (uid,)) def get_alerts(query, frequency): r = run_sql('select id_user, id_query, id_basket, frequency, date_lastrun, alert_name, notification from user_query_basket where id_query=%s and frequency=%s;', (query['id_query'], frequency,)) return {'alerts': r, 'records': query['records'], 'argstr': query['argstr'], 'date_from': query['date_from'], 'date_until': query['date_until']} # Optimized version: def add_records_to_basket(record_ids, basket_id): nrec = len(record_ids) if nrec > 0: if CFG_WEBALERT_DEBUG_LEVEL > 0: print "-> adding %s records into basket %s: %s" % (nrec, basket_id, record_ids) try: if CFG_WEBALERT_DEBUG_LEVEL < 4: owner_uid = get_basket_owner_id(basket_id) add_to_basket(owner_uid, record_ids, [basket_id]) else: print ' NOT ADDED, DEBUG LEVEL == 4' except Exception: register_exception() def get_query(alert_id): r = run_sql('select urlargs from query where id=%s', (alert_id,)) return r[0][0] def email_notify(alert, records, argstr): if len(records) == 0: return msg = "" if CFG_WEBALERT_DEBUG_LEVEL > 0: msg = "*** THIS MESSAGE WAS SENT IN DEBUG MODE ***\n\n" url = CFG_SITE_URL + "/search?" + argstr # Extract the pattern and catalogue list from the formatted query query = parse_qs(argstr) pattern = query.get('p', [''])[0] catalogues = query.get('c', []) frequency = alert[3] msg += webalert_templates.tmpl_alert_email_body( alert[5], url, records, pattern, catalogues, frequency) email = get_email(alert[0]) if email == 'guest': print "********************************************************************************" print "The following alert was not send, because cannot detect user email address:" print " " + repr(argstr) print "********************************************************************************" return if CFG_WEBALERT_DEBUG_LEVEL > 0: print "********************************************************************************" print msg print "********************************************************************************" if CFG_WEBALERT_DEBUG_LEVEL < 2: send_email(fromaddr=webalert_templates.tmpl_alert_email_from(), toaddr=email, subject=webalert_templates.tmpl_alert_email_title(alert[5]), content=msg, header='', footer='', attempt_times=CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES, attempt_sleeptime=CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES) if CFG_WEBALERT_DEBUG_LEVEL == 4: send_email(fromaddr=webalert_templates.tmpl_alert_email_from(), toaddr=CFG_SITE_SUPPORT_EMAIL, subject=webalert_templates.tmpl_alert_email_title(alert[5]), content=msg, header='', footer='', attempt_times=CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES, attempt_sleeptime=CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES) def get_argument(args, argname): if args.has_key(argname): return args[argname] else: return [] def _date_to_tuple(date): return [int(part) for part in (date.year, date.month, date.day)] def get_record_ids(argstr, date_from, date_until): argd = wash_urlargd(parse_qs(argstr), websearch_templates.search_results_default_urlargd) p = get_argument(argd, 'p') c = get_argument(argd, 'c') cc = get_argument(argd, 'cc') - as = get_argument(argd, 'as') + aas = get_argument(argd, 'aas') f = get_argument(argd, 'f') so = get_argument(argd, 'so') sp = get_argument(argd, 'sp') ot = get_argument(argd, 'ot') - as = get_argument(argd, 'as') p1 = get_argument(argd, 'p1') f1 = get_argument(argd, 'f1') m1 = get_argument(argd, 'm1') op1 = get_argument(argd, 'op1') p2 = get_argument(argd, 'p2') f2 = get_argument(argd, 'f2') m2 = get_argument(argd, 'm2') op2 = get_argument(argd, 'op2') p3 = get_argument(argd, 'p3') f3 = get_argument(argd, 'f3') m3 = get_argument(argd, 'm3') sc = get_argument(argd, 'sc') d1y, d1m, d1d = _date_to_tuple(date_from) d2y, d2m, d2d = _date_to_tuple(date_until) return perform_request_search(of='id', p=p, c=c, cc=cc, f=f, so=so, sp=sp, ot=ot, - as=as, p1=p1, f1=f1, m1=m1, op1=op1, p2=p2, f2=f2, + aas=aas, p1=p1, f1=f1, m1=m1, op1=op1, p2=p2, f2=f2, m2=m2, op2=op2, p3=p3, f3=f3, m3=m3, sc=sc, d1y=d1y, d1m=d1m, d1d=d1d, d2y=d2y, d2m=d2m, d2d=d2d) def get_argument_as_string(argstr, argname): args = parse_qs(argstr) a = get_argument(args, argname) r = '' if len(a): r = a[0] for i in a[1:len(a)]: r += ", %s" % i return r def get_pattern(argstr): return get_argument_as_string(argstr, 'p') def get_catalogue(argstr): return get_argument_as_string(argstr, 'c') def get_catalogue_num(argstr): args = parse_qs(argstr) a = get_argument(args, 'c') return len(a) def run_query(query, frequency, date_until): """Return a dictionary containing the information of the performed query. The information contains the id of the query, the arguments as a string, and the list of found records.""" if frequency == 'day': date_from = date_until - datetime.timedelta(days=1) elif frequency == 'week': date_from = date_until - datetime.timedelta(weeks=1) else: # Months are not an explicit notion of timedelta (it's the # most ambiguous too). So we explicitely take the same day of # the previous month. d, m, y = (date_until.day, date_until.month, date_until.year) m = m - 1 if m == 0: m = 12 y = y - 1 date_from = datetime.date(year=y, month=m, day=d) recs = get_record_ids(query[1], date_from, date_until) n = len(recs) if n: log('query %08s produced %08s records' % (query[0], len(recs))) if CFG_WEBALERT_DEBUG_LEVEL > 2: print "[%s] run query: %s with dates: from=%s, until=%s\n found rec ids: %s" % ( strftime("%c"), query, date_from, date_until, recs) return {'id_query': query[0], 'argstr': query[1], 'records': recs, 'date_from': date_from, 'date_until': date_until} def process_alert_queries(frequency, date): """Run the alerts according to the frequency. Retrieves the queries for which an alert exists, performs it, and processes the corresponding alerts.""" alert_queries = get_alert_queries(frequency) for aq in alert_queries: q = run_query(aq, frequency, date) alerts = get_alerts(q, frequency) process_alerts(alerts) def replace_argument(argstr, argname, argval): """Replace the given date argument value with the new one. If the argument is missing, it is added.""" if search('%s=\d+' % argname, argstr): r = sub('%s=\d+' % argname, '%s=%s' % (argname, argval), argstr) else: r = argstr + '&%s=%s' % (argname, argval) return r def update_arguments(argstr, date_from, date_until): """Replace date arguments in argstr with the ones specified by date_from and date_until. Absent arguments are added.""" d1y, d1m, d1d = _date_to_tuple(date_from) d2y, d2m, d2d = _date_to_tuple(date_until) r = replace_argument(argstr, 'd1y', d1y) r = replace_argument(r, 'd1m', d1m) r = replace_argument(r, 'd1d', d1d) r = replace_argument(r, 'd2y', d2y) r = replace_argument(r, 'd2m', d2m) r = replace_argument(r, 'd2d', d2d) return r def log(msg): try: logfile = open(CFG_LOGDIR + '/alertengine.log', 'a') logfile.write(strftime('%Y%m%d%H%M%S#')) logfile.write(msg + '\n') logfile.close() except Exception: register_exception() def process_alerts(alerts): # TBD: do not generate the email each time, forge it once and then # send it to all appropriate people for a in alerts['alerts']: if alert_use_basket_p(a): add_records_to_basket(alerts['records'], a[2]) if alert_use_notification_p(a): argstr = update_arguments(alerts['argstr'], alerts['date_from'], alerts['date_until']) try: email_notify(a, alerts['records'], argstr) except Exception: # There were troubles sending this alert, so register # this exception and continue with other alerts: register_exception(alert_admin=True, prefix="Error when sending alert %s, %s\n." % \ (repr(a), repr(argstr))) update_date_lastrun(a) def alert_use_basket_p(alert): return alert[2] != 0 def alert_use_notification_p(alert): return alert[6] == 'y' def run_alerts(date): """Run the alerts. First decide which alerts to run according to the current local time, and runs them.""" if date.day == 1: process_alert_queries('month', date) if date.isoweekday() == 1: # first day of the week process_alert_queries('week', date) process_alert_queries('day', date) def process_alert_queries_for_user(uid, date): """Process the alerts for the given user id. All alerts are with reference date set as the current local time.""" alert_queries = get_alert_queries_for_user(uid) print alert_queries for aq in alert_queries: frequency = aq[2] q = run_query(aq, frequency, date) alerts = get_alerts(q, frequency) process_alerts(alerts) diff --git a/modules/websearch/doc/hacking/search-engine-api.webdoc b/modules/websearch/doc/hacking/search-engine-api.webdoc index 9ebbae1da..5a70047fe 100644 --- a/modules/websearch/doc/hacking/search-engine-api.webdoc +++ b/modules/websearch/doc/hacking/search-engine-api.webdoc @@ -1,344 +1,344 @@ ## -*- mode: html; coding: utf-8; -*- ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. <!-- WebDoc-Page-Title: Search Engine API --> <!-- WebDoc-Page-Navtrail: <a class="navtrail" href="<CFG_SITE_URL>/help/hacking">Hacking CDS Invenio</a> > <a class="navtrail" href="search-engine-internals">WebSearch Internals</a> --> <!-- WebDoc-Page-Revision: $Id$ --> <protect> <pre> CDS Invenio Search Engine can be called from within your Python programs via both a high-level and low-level API interface. 1. High-level API Description: The high-level access to the search engine is provided by exactly the same function as called from the web interface when users submit their queries. This should guarantee exactly the same behaviour, and means that you can pass to the high-level API all the arguments as you see them in the URL. There are two things to note: (i) the function does not check for eventual restricted status of the collection, so the restricted collections will be searched without asking for a password; (ii) the output format argument (``of'') should be set to ``id'' (which is the default value) meaning to return list of recIDs. The function returns the list of recIDs in this case. Signature: - def perform_request_search(req=None, cc=CFG_SITE_NAME, c=None, p="", f="", rg=10, sf="", so="d", sp="", rm="", of="id", ot="", as=0, + def perform_request_search(req=None, cc=CFG_SITE_NAME, c=None, p="", f="", rg=10, sf="", so="d", sp="", rm="", of="id", ot="", aas=0, p1="", f1="", m1="", op1="", p2="", f2="", m2="", op2="", p3="", f3="", m3="", sc=0, jrec=0, recid=-1, recidb=-1, sysno="", id=-1, idb=-1, sysnb="", action="", d1y=0, d1m=0, d1d=0, d2y=0, d2m=0, d2d=0, dt="", verbose=0, ap=0, ln=CFG_SITE_LANG, ec=None): """Perform search or browse request, without checking for authentication. Return list of recIDs found, if of=id. Otherwise create web page. The arguments are as follows: req - mod_python Request class instance. cc - current collection (e.g. "ATLAS"). The collection the user started to search/browse from. c - collection list (e.g. ["Theses", "Books"]). The collections user may have selected/deselected when starting to search from 'cc'. ec - external collection list (e.g. ['CiteSeer', 'Google']). The external collections may have been selected/deselected by the user. p - pattern to search for (e.g. "ellis and muon or kaon"). f - field to search within (e.g. "author"). rg - records in groups of (e.g. "10"). Defines how many hits per collection in the search results page are displayed. sf - sort field (e.g. "title"). so - sort order ("a"=ascending, "d"=descending). sp - sort pattern (e.g. "CERN-") -- in case there are more values in a sort field, this argument tells which one to prefer rm - ranking method (e.g. "jif"). Defines whether results should be ranked by some known ranking method. of - output format (e.g. "hb"). Usually starting "h" means HTML output (and "hb" for HTML brief, "hd" for HTML detailed), "x" means XML output, "t" means plain text output, "id" means no output at all but to return list of recIDs found. (Suitable for high-level API.) ot - output only these MARC tags (e.g. "100,700,909C0b"). Useful if only some fields are to be shown in the output, e.g. for library to control some fields. - as - advanced search ("0" means no, "1" means yes). Whether + aas - advanced search ("0" means no, "1" means yes). Whether search was called from within the advanced search interface. p1 - first pattern to search for in the advanced search interface. Much like 'p'. f1 - first field to search within in the advanced search interface. Much like 'f'. m1 - first matching type in the advanced search interface. ("a" all of the words, "o" any of the words, "e" exact phrase, "p" partial phrase, "r" regular expression). op1 - first operator, to join the first and the second unit in the advanced search interface. ("a" add, "o" or, "n" not). p2 - second pattern to search for in the advanced search interface. Much like 'p'. f2 - second field to search within in the advanced search interface. Much like 'f'. m2 - second matching type in the advanced search interface. ("a" all of the words, "o" any of the words, "e" exact phrase, "p" partial phrase, "r" regular expression). op2 - second operator, to join the second and the third unit in the advanced search interface. ("a" add, "o" or, "n" not). p3 - third pattern to search for in the advanced search interface. Much like 'p'. f3 - third field to search within in the advanced search interface. Much like 'f'. m3 - third matching type in the advanced search interface. ("a" all of the words, "o" any of the words, "e" exact phrase, "p" partial phrase, "r" regular expression). sc - split by collection ("0" no, "1" yes). Governs whether we want to present the results in a single huge list, or splitted by collection. jrec - jump to record (e.g. "234"). Used for navigation inside the search results. recid - display record ID (e.g. "20000"). Do not search/browse but go straight away to the Detailed record page for the given recID. recidb - display record ID bis (e.g. "20010"). If greater than 'recid', then display records from recid to recidb. Useful for example for dumping records from the database for reformatting. sysno - display old system SYS number (e.g. ""). If you migrate to CDS Invenio from another system, and store your old SYS call numbers, you can use them instead of recid if you wish so. id - the same as recid, in case recid is not set. For backwards compatibility. idb - the same as recid, in case recidb is not set. For backwards compatibility. sysnb - the same as sysno, in case sysno is not set. For backwards compatibility. action - action to do. "SEARCH" for searching, "Browse" for browsing. Default is to search. d1 - first datetime in full YYYY-mm-dd HH:MM:DD format (e.g. "1998-08-23 12:34:56"). Useful for search limits on creation/modification date (see 'dt' argument below). Note that 'd1' takes precedence over d1y, d1m, d1d if these are defined. d1y - first date's year (e.g. "1998"). Useful for search limits on creation/modification date. d1m - first date's month (e.g. "08"). Useful for search limits on creation/modification date. d1d - first date's day (e.g. "23"). Useful for search limits on creation/modification date. d2 - second datetime in full YYYY-mm-dd HH:MM:DD format (e.g. "1998-09-02 12:34:56"). Useful for search limits on creation/modification date (see 'dt' argument below). Note that 'd2' takes precedence over d2y, d2m, d2d if these are defined. d2y - second date's year (e.g. "1998"). Useful for search limits on creation/modification date. d2m - second date's month (e.g. "09"). Useful for search limits on creation/modification date. d2d - second date's day (e.g. "02"). Useful for search limits on creation/modification date. dt - first and second date's type (e.g. "c"). Specifies whether to search in creation dates ("c") or in modification dates ("m"). When dt is not set and d1* and d2* are set, the default is "c". verbose - verbose level (0=min, 9=max). Useful to print some internal information on the searching process in case something goes wrong. ap - alternative patterns (0=no, 1=yes). In case no exact match is found, the search engine can try alternative patterns e.g. to replace non-alphanumeric characters by a boolean query. ap defines if this is wanted. ln - language of the search interface (e.g. "en"). Useful for internationalization. ec - list of external search engines to search as well (e.g. "SPIRES HEP"). """ Examples: >>> # import the function: >>> from invenio.search_engine import perform_request_search >>> # get all hits in a collection: >>> perform_request_search(cc="ATLAS Communications") >>> # search for the word `of' in Theses and Books: >>> perform_request_search(p="of", c=["Theses","Books"]) >>> # search for `muon or kaon' within title: >>> perform_request_search(p="muon or kaon", f="title") >>> # phrase search (not the quotes): >>> perform_request_search(p='"Ellis, J"', f="author") >>> # regexp search for a system number >>> perform_request_search(p1="^CERN.*2003-001$", f1="reportnumber", m1="r") >>> # moi inside Standards gives no hits... >>> perform_request_search(p="moi", cc="Standards") >>> # but it does if we use alternative patterns: >>> perform_request_search(p="moi", cc="Standards", ap=1) 2. Mid-level API Description: The mid-level API is provided by a search_pattern() function that only searches for the given pattern in the given field according to the given matching pattern. This function does not know anything about collection. The function does not wash its arguments, it expects them to be `clean' already. The pattern is split into `basic search units' for which a boolean query is launched. The function returns an instance of the HitSet class. Note that if you want to obtain the list of recIDs (as with the high-level API), you can invoke the ``tolist()'' method on a hitset. Signature: def search_pattern(req=None, p=None, f=None, m=None, ap=0, of="id", verbose=0): """Search for complex pattern 'p' within field 'f' according to matching type 'm'. Return hitset of recIDs. The function uses multi-stage searching algorithm in case of no exact match found. See the Search Internals document for detailed description. The 'ap' argument governs whether an alternative patterns are to be used in case there is no direct hit for (p,f,m). For example, whether to replace non-alphanumeric characters by spaces if it would give some hits. See the Search Internals document for detailed description. (ap=0 forbits the alternative pattern usage, ap=1 permits it.) The 'of' argument governs whether to print or not some information to the user in case of no match found. (Usually it prints the information in case of HTML formats, otherwise it's silent). The 'verbose' argument controls the level of debugging information to be printed (0=least, 9=most). All the parameters are assumed to have been previously washed. This function is suitable as a mid-level API. """ Examples: >>> # import the function: >>> from invenio.search_engine import search_pattern >>> # search for muon or kaon in any field: >>> search_pattern(p="muon or kaon").tolist() >>> # the following finds nothing by default... >>> search_pattern(p="cern-moi").tolist() >>> # ...but it does find something if we allow alternative patterns: >>> search_pattern(p="cern-moi", ap=1).tolist() >>> # wildcard search for a report number: >>> search_pattern(p="CERN-LHC-PROJECT-REPORT-40*", f="reportnumber").tolist() >>> # regexp search for a report number with possible trailing subjects: >>> search_pattern(p="^CERN-LHC-PROJECT-REPORT-40(-|$)", f="reportnumber", m="r").tolist() 3. Low-level API Description: The low-level API is provided by search_unit() function that assumes its arguments to be already the basic search units. Therefore it does not know anything about boolean queries, etc. The function returns an instance of the HitSet class. Note that if you want to obtain the list of recIDs (as with the high-level API), you can invoke the ``tolist()'' method on a hitset. Signature: def search_unit(p, f=None, m=None): """Search for basic search unit defined by pattern 'p' and field 'f' and matching type 'm'. Return hitset of recIDs. All the parameters are assumed to have been previously washed. 'p' is assumed to be already a ``basic search unit'' so that it is searched as such and is not broken up in any way. Only wildcard and span queries are being detected inside 'p'. This function is suitable as a low-level API. """ Examples: >>> # import the function: >>> from invenio.search_engine import search_unit >>> # search moi in any field: >>> search_unit(p="moi").tolist() >>> # this one will not match: >>> search_unit(p="muon or kaon").tolist() >>> # regexp search for a report number with possible trailing subjects: >>> search_unit(p="^CERN-PS-99-037(-|$)", f="reportnumber", m="r").tolist() More entry points may be created, but I think this threesome kind of access to the search engine should cover all your needs. </pre> </protect> diff --git a/modules/websearch/lib/search_engine.py b/modules/websearch/lib/search_engine.py index 750359e87..19c0e0973 100644 --- a/modules/websearch/lib/search_engine.py +++ b/modules/websearch/lib/search_engine.py @@ -1,4532 +1,4532 @@ # -*- coding: utf-8 -*- ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. # pylint: disable-msg=C0301 """CDS Invenio Search Engine in mod_python.""" __lastupdated__ = """$Date$""" __revision__ = "$Id$" ## import general modules: import cgi import copy import string import os import re import time import urllib import urlparse import zlib ## import CDS Invenio stuff: from invenio.config import \ CFG_CERN_SITE, \ CFG_OAI_ID_FIELD, \ CFG_WEBCOMMENT_ALLOW_REVIEWS, \ CFG_WEBSEARCH_CALL_BIBFORMAT, \ CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX, \ CFG_WEBSEARCH_FIELDS_CONVERT, \ CFG_WEBSEARCH_NB_RECORDS_TO_SORT, \ CFG_WEBSEARCH_SEARCH_CACHE_SIZE, \ CFG_WEBSEARCH_USE_JSMATH_FOR_FORMATS, \ CFG_WEBSEARCH_USE_ALEPH_SYSNOS, \ CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE, \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, \ CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS, \ CFG_SITE_LANG, \ CFG_SITE_NAME, \ CFG_LOGDIR, \ CFG_SITE_URL from invenio.search_engine_config import InvenioWebSearchUnknownCollectionError from invenio.bibrecord import create_record from invenio.bibrank_record_sorter import get_bibrank_methods, rank_records, is_method_valid from invenio.bibrank_downloads_similarity import register_page_view_event, calculate_reading_similarity_list from invenio.bibindex_engine_stemmer import stem from invenio.bibformat import format_record, format_records, get_output_format_content_type, create_excel from invenio.bibformat_config import CFG_BIBFORMAT_USE_OLD_BIBFORMAT from invenio.bibrank_downloads_grapher import create_download_history_graph_and_box from invenio.data_cacher import DataCacher from invenio.websearch_external_collections import print_external_results_overview, perform_external_collection_search from invenio.access_control_admin import acc_get_action_id from invenio.access_control_config import VIEWRESTRCOLL, \ CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS from invenio.websearchadminlib import get_detailed_page_tabs from invenio.intbitset import intbitset as HitSet from invenio.dbquery import DatabaseError from invenio.access_control_engine import acc_authorize_action from invenio.errorlib import register_exception from invenio.textutils import encode_for_xml import invenio.template webstyle_templates = invenio.template.load('webstyle') webcomment_templates = invenio.template.load('webcomment') from invenio.bibrank_citation_searcher import calculate_cited_by_list, \ calculate_co_cited_with_list, get_records_with_num_cites, get_self_cited_by from invenio.bibrank_citation_grapher import create_citation_history_graph_and_box from invenio.dbquery import run_sql, run_sql_cached, get_table_update_time, Error from invenio.webuser import getUid, collect_user_info from invenio.webpage import pageheaderonly, pagefooteronly, create_error_box from invenio.messages import gettext_set_language from invenio.search_engine_query_parser import SearchQueryParenthesisedParser, \ InvenioWebSearchQueryParserException, SpiresToInvenioSyntaxConverter try: from mod_python import apache except ImportError, e: pass # ignore user personalisation, needed e.g. for command-line try: import invenio.template websearch_templates = invenio.template.load('websearch') except: pass ## global vars: cfg_nb_browse_seen_records = 100 # limit of the number of records to check when browsing certain collection cfg_nicely_ordered_collection_list = 0 # do we propose collection list nicely ordered or alphabetical? ## precompile some often-used regexp for speed reasons: re_word = re.compile('[\s]') re_quotes = re.compile('[\'\"]') re_doublequote = re.compile('\"') re_equal = re.compile('\=') re_logical_and = re.compile('\sand\s', re.I) re_logical_or = re.compile('\sor\s', re.I) re_logical_not = re.compile('\snot\s', re.I) re_operators = re.compile(r'\s([\+\-\|])\s') re_pattern_wildcards_at_beginning = re.compile(r'(\s)[\*\%]+') re_pattern_single_quotes = re.compile("'(.*?)'") re_pattern_double_quotes = re.compile("\"(.*?)\"") re_pattern_regexp_quotes = re.compile("\/(.*?)\/") re_pattern_short_words = re.compile(r'([\s\"]\w{1,3})[\*\%]+') re_pattern_space = re.compile("__SPACE__") re_pattern_today = re.compile("\$TODAY\$") re_pattern_parens = re.compile(r'\([^\)]+\s+[^\)]+\)') re_unicode_lowercase_a = re.compile(unicode(r"(?u)[áàäâãå]", "utf-8")) re_unicode_lowercase_ae = re.compile(unicode(r"(?u)[æ]", "utf-8")) re_unicode_lowercase_e = re.compile(unicode(r"(?u)[éèëê]", "utf-8")) re_unicode_lowercase_i = re.compile(unicode(r"(?u)[íìïî]", "utf-8")) re_unicode_lowercase_o = re.compile(unicode(r"(?u)[óòöôõø]", "utf-8")) re_unicode_lowercase_u = re.compile(unicode(r"(?u)[úùüû]", "utf-8")) re_unicode_lowercase_y = re.compile(unicode(r"(?u)[ýÿ]", "utf-8")) re_unicode_lowercase_c = re.compile(unicode(r"(?u)[çć]", "utf-8")) re_unicode_lowercase_n = re.compile(unicode(r"(?u)[ñ]", "utf-8")) re_unicode_uppercase_a = re.compile(unicode(r"(?u)[ÁÀÄÂÃÅ]", "utf-8")) re_unicode_uppercase_ae = re.compile(unicode(r"(?u)[Æ]", "utf-8")) re_unicode_uppercase_e = re.compile(unicode(r"(?u)[ÉÈËÊ]", "utf-8")) re_unicode_uppercase_i = re.compile(unicode(r"(?u)[ÍÌÏÎ]", "utf-8")) re_unicode_uppercase_o = re.compile(unicode(r"(?u)[ÓÒÖÔÕØ]", "utf-8")) re_unicode_uppercase_u = re.compile(unicode(r"(?u)[ÚÙÜÛ]", "utf-8")) re_unicode_uppercase_y = re.compile(unicode(r"(?u)[Ý]", "utf-8")) re_unicode_uppercase_c = re.compile(unicode(r"(?u)[ÇĆ]", "utf-8")) re_unicode_uppercase_n = re.compile(unicode(r"(?u)[Ñ]", "utf-8")) re_latex_lowercase_a = re.compile("\\\\[\"H'`~^vu=k]\{?a\}?") re_latex_lowercase_ae = re.compile("\\\\ae\\{\\}?") re_latex_lowercase_e = re.compile("\\\\[\"H'`~^vu=k]\\{?e\\}?") re_latex_lowercase_i = re.compile("\\\\[\"H'`~^vu=k]\\{?i\\}?") re_latex_lowercase_o = re.compile("\\\\[\"H'`~^vu=k]\\{?o\\}?") re_latex_lowercase_u = re.compile("\\\\[\"H'`~^vu=k]\\{?u\\}?") re_latex_lowercase_y = re.compile("\\\\[\"']\\{?y\\}?") re_latex_lowercase_c = re.compile("\\\\['uc]\\{?c\\}?") re_latex_lowercase_n = re.compile("\\\\[c'~^vu]\\{?n\\}?") re_latex_uppercase_a = re.compile("\\\\[\"H'`~^vu=k]\\{?A\\}?") re_latex_uppercase_ae = re.compile("\\\\AE\\{?\\}?") re_latex_uppercase_e = re.compile("\\\\[\"H'`~^vu=k]\\{?E\\}?") re_latex_uppercase_i = re.compile("\\\\[\"H'`~^vu=k]\\{?I\\}?") re_latex_uppercase_o = re.compile("\\\\[\"H'`~^vu=k]\\{?O\\}?") re_latex_uppercase_u = re.compile("\\\\[\"H'`~^vu=k]\\{?U\\}?") re_latex_uppercase_y = re.compile("\\\\[\"']\\{?Y\\}?") re_latex_uppercase_c = re.compile("\\\\['uc]\\{?C\\}?") re_latex_uppercase_n = re.compile("\\\\[c'~^vu]\\{?N\\}?") class RestrictedCollectionDataCacher(DataCacher): def __init__(self): def cache_filler(): ret = [] try: viewcollid = acc_get_action_id(VIEWRESTRCOLL) res = run_sql("""SELECT DISTINCT ar.value FROM accROLE_accACTION_accARGUMENT raa JOIN accARGUMENT ar ON raa.id_accARGUMENT = ar.id WHERE ar.keyword = 'collection' AND raa.id_accACTION = %s""", (viewcollid,)) except Exception: # database problems, return empty cache return [] for coll in res: ret.append(coll[0]) return ret def timestamp_verifier(): return max(get_table_update_time('accROLE_accACTION_accARGUMENT'), get_table_update_time('accARGUMENT')) DataCacher.__init__(self, cache_filler, timestamp_verifier) def collection_restricted_p(collection): restricted_collection_cache.recreate_cache_if_needed() return collection in restricted_collection_cache.cache try: restricted_collection_cache.is_ok_p except Exception: restricted_collection_cache = RestrictedCollectionDataCacher() def get_permitted_restricted_collections(user_info): """Return a list of collection that are restricted but for which the user is authorized.""" restricted_collection_cache.recreate_cache_if_needed() ret = [] for collection in restricted_collection_cache.cache: if acc_authorize_action(user_info, 'viewrestrcoll', collection=collection)[0] == 0: ret.append(collection) return ret def is_user_owner_of_record(user_info, recid): """ Check if the user is owner of the record, i.e. he is the submitter and/or belongs to a owner-like group authorized to 'see' the record. @param user_info: the user_info dictionary that describe the user. @type user_info: user_info dictionary @param recid: the record identifier. @type recid: positive integer @return: True if the user is 'owner' of the record; False otherwise @rtype: bool """ authorized_emails_or_group = [] for tag in CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS: authorized_emails_or_group.extend(get_fieldvalues(recid, tag)) for email_or_group in authorized_emails_or_group: if email_or_group in user_info['group']: return True email = email_or_group.strip().lower() if user_info['email'].strip().lower() == email: return True return False def check_user_can_view_record(user_info, recid): """ Check if the user is authorized to view the given recid. The function grants access in two cases: either user has author rights on this record, or he has view rights to the primary collection this record belongs to. @param user_info: the user_info dictionary that describe the user. @type user_info: user_info dictionary @param recid: the record identifier. @type recid: positive integer @return: (0, ''), when authorization is granted, (>0, 'message') when authorization is not granted @rtype: (int, string) """ record_primary_collection = guess_primary_collection_of_a_record(recid) if collection_restricted_p(record_primary_collection): (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=record_primary_collection) if auth_code == 0 or is_user_owner_of_record(user_info, recid): return (0, '') else: return (auth_code, auth_msg) else: return (0, '') class IndexStemmingDataCacher(DataCacher): """ Provides cache for stemming information for word/phrase indexes. This class is not to be used directly; use function get_index_stemming_language() instead. """ def __init__(self): def cache_filler(): try: res = run_sql("""SELECT id, stemming_language FROM idxINDEX""") except DatabaseError: # database problems, return empty cache return {} return dict(res) def timestamp_verifier(): return get_table_update_time('idxINDEX') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: index_stemming_cache.is_ok_p except Exception: index_stemming_cache = IndexStemmingDataCacher() def get_index_stemming_language(index_id): """Return stemming langugage for given index.""" index_stemming_cache.recreate_cache_if_needed() return index_stemming_cache.cache[index_id] class CollectionRecListDataCacher(DataCacher): """ Provides cache for collection reclist hitsets. This class is not to be used directly; use function get_collection_reclist() instead. """ def __init__(self): def cache_filler(): ret = {} try: res = run_sql("SELECT name,reclist FROM collection") except Exception: # database problems, return empty cache return {} for name, reclist in res: ret[name] = None # this will be filled later during runtime by calling get_collection_reclist(coll) return ret def timestamp_verifier(): return get_table_update_time('collection') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not collection_reclist_cache.is_ok_p: raise Exception except Exception: collection_reclist_cache = CollectionRecListDataCacher() def get_collection_reclist(coll): """Return hitset of recIDs that belong to the collection 'coll'.""" collection_reclist_cache.recreate_cache_if_needed() if not collection_reclist_cache.cache[coll]: # not yet it the cache, so calculate it and fill the cache: set = HitSet() query = "SELECT nbrecs,reclist FROM collection WHERE name=%s" res = run_sql(query, (coll, ), 1) if res: try: set = HitSet(res[0][1]) except: pass collection_reclist_cache.cache[coll] = set # finally, return reclist: return collection_reclist_cache.cache[coll] class SearchResultsCache(DataCacher): """ Provides temporary lazy cache for Search Results. Useful when users click on `next page'. """ def __init__(self): def cache_filler(): return {} def timestamp_verifier(): return '1970-01-01 00:00:00' # lazy cache is always okay; # its filling is governed by # CFG_WEBSEARCH_SEARCH_CACHE_SIZE DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not search_results_cache.is_ok_p: raise Exception except Exception: search_results_cache = SearchResultsCache() class CollectionI18nNameDataCacher(DataCacher): """ Provides cache for I18N collection names. This class is not to be used directly; use function get_coll_i18nname() instead. """ def __init__(self): def cache_filler(): ret = {} try: res = run_sql("SELECT c.name,cn.ln,cn.value FROM collectionname AS cn, collection AS c WHERE cn.id_collection=c.id AND cn.type='ln'") # ln=long name except Exception: # database problems return {} for c, ln, i18nname in res: if i18nname: if not ret.has_key(c): ret[c] = {} ret[c][ln] = i18nname return ret def timestamp_verifier(): return get_table_update_time('collectionname') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not collection_i18nname_cache.is_ok_p: raise Exception except Exception: collection_i18nname_cache = CollectionI18nNameDataCacher() def get_coll_i18nname(c, ln=CFG_SITE_LANG, verify_cache_timestamp=True): """ Return nicely formatted collection name (of the name type `ln' (=long name)) for collection C in language LN. This function uses collection_i18nname_cache, but it verifies whether the cache is up-to-date first by default. This verification step is performed by checking the DB table update time. So, if you call this function 1000 times, it can get very slow because it will do 1000 table update time verifications, even though collection names change not that often. Hence the parameter VERIFY_CACHE_TIMESTAMP which, when set to False, will assume the cache is already up-to-date. This is useful namely in the generation of collection lists for the search results page. """ if verify_cache_timestamp: collection_i18nname_cache.recreate_cache_if_needed() out = c try: out = collection_i18nname_cache.cache[c][ln] except KeyError: pass # translation in LN does not exist return out class FieldI18nNameDataCacher(DataCacher): """ Provides cache for I18N field names. This class is not to be used directly; use function get_field_i18nname() instead. """ def __init__(self): def cache_filler(): ret = {} try: res = run_sql("SELECT f.name,fn.ln,fn.value FROM fieldname AS fn, field AS f WHERE fn.id_field=f.id AND fn.type='ln'") # ln=long name except Exception: # database problems, return empty cache return {} for f, ln, i18nname in res: if i18nname: if not ret.has_key(f): ret[f] = {} ret[f][ln] = i18nname return ret def timestamp_verifier(): return get_table_update_time('fieldname') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not field_i18nname_cache.is_ok_p: raise Exception except Exception: field_i18nname_cache = FieldI18nNameDataCacher() def get_field_i18nname(f, ln=CFG_SITE_LANG, verify_cache_timestamp=True): """ Return nicely formatted field name (of type 'ln', 'long name') for field F in language LN. If VERIFY_CACHE_TIMESTAMP is set to True, then verify DB timestamp and field I18N name cache timestamp and refresh cache from the DB if needed. Otherwise don't bother checking DB timestamp and return the cached value. (This is useful when get_field_i18nname is called inside a loop.) """ if verify_cache_timestamp: field_i18nname_cache.recreate_cache_if_needed() out = f try: out = field_i18nname_cache.cache[f][ln] except KeyError: pass # translation in LN does not exist return out def get_alphabetically_ordered_collection_list(level=0, ln=CFG_SITE_LANG): """Returns nicely ordered (score respected) list of collections, more exactly list of tuples (collection name, printable collection name). Suitable for create_search_box().""" out = [] res = run_sql_cached("SELECT id,name FROM collection ORDER BY name ASC", affected_tables=['collection',]) for c_id, c_name in res: # make a nice printable name (e.g. truncate c_printable for # long collection names in given language): c_printable_fullname = get_coll_i18nname(c_name, ln, False) c_printable = wash_index_term(c_printable_fullname, 30, False) if c_printable != c_printable_fullname: c_printable = c_printable + "..." if level: c_printable = " " + level * '-' + " " + c_printable out.append([c_name, c_printable]) return out def get_nicely_ordered_collection_list(collid=1, level=0, ln=CFG_SITE_LANG): """Returns nicely ordered (score respected) list of collections, more exactly list of tuples (collection name, printable collection name). Suitable for create_search_box().""" colls_nicely_ordered = [] res = run_sql("""SELECT c.name,cc.id_son FROM collection_collection AS cc, collection AS c WHERE c.id=cc.id_son AND cc.id_dad=%s ORDER BY score DESC""", (collid, )) for c, cid in res: # make a nice printable name (e.g. truncate c_printable for # long collection names in given language): c_printable_fullname = get_coll_i18nname(c, ln, False) c_printable = wash_index_term(c_printable_fullname, 30, False) if c_printable != c_printable_fullname: c_printable = c_printable + "..." if level: c_printable = " " + level * '-' + " " + c_printable colls_nicely_ordered.append([c, c_printable]) colls_nicely_ordered = colls_nicely_ordered + get_nicely_ordered_collection_list(cid, level+1, ln=ln) return colls_nicely_ordered def get_index_id_from_field(field): """ Return index id with name corresponding to FIELD, or the first index id where the logical field code named FIELD is indexed. Return zero in case there is no index defined for this field. Example: field='author', output=4. """ out = 0 if field == '': field = 'global' # empty string field means 'global' index (field 'anyfield') # first look in the index table: res = run_sql("""SELECT id FROM idxINDEX WHERE name=%s""", (field,)) if res: out = res[0][0] return out # not found in the index table, now look in the field table: res = run_sql("""SELECT w.id FROM idxINDEX AS w, idxINDEX_field AS wf, field AS f WHERE f.code=%s AND wf.id_field=f.id AND w.id=wf.id_idxINDEX LIMIT 1""", (field,)) if res: out = res[0][0] return out def get_words_from_pattern(pattern): "Returns list of whitespace-separated words from pattern." words = {} for word in string.split(pattern): if not words.has_key(word): words[word] = 1; return words.keys() def create_basic_search_units(req, p, f, m=None, of='hb'): """Splits search pattern and search field into a list of independently searchable units. - A search unit consists of '(operator, pattern, field, type, hitset)' tuples where 'operator' is set union (|), set intersection (+) or set exclusion (-); 'pattern' is either a word (e.g. muon*) or a phrase (e.g. 'nuclear physics'); 'field' is either a code like 'title' or MARC tag like '100__a'; 'type' is the search type ('w' for word file search, 'a' for access file search). - Optionally, the function accepts the match type argument 'm'. If it is set (e.g. from advanced search interface), then it performs this kind of matching. If it is not set, then a guess is made. 'm' can have values: 'a'='all of the words', 'o'='any of the words', 'p'='phrase/substring', 'r'='regular expression', 'e'='exact value'. - Warnings are printed on req (when not None) in case of HTML output formats.""" opfts = [] # will hold (o,p,f,t,h) units # FIXME: quick hack for the journal index if f == 'journal': opfts.append(['+', p, f, 'w']) return opfts ## check arguments: is desired matching type set? if m: ## A - matching type is known; good! if m == 'e': # A1 - exact value: opfts.append(['+', p, f, 'a']) # '+' since we have only one unit elif m == 'p': # A2 - phrase/substring: opfts.append(['+', "%" + p + "%", f, 'a']) # '+' since we have only one unit elif m == 'r': # A3 - regular expression: opfts.append(['+', p, f, 'r']) # '+' since we have only one unit elif m == 'a' or m == 'w': # A4 - all of the words: p = strip_accents(p) # strip accents for 'w' mode, FIXME: delete when not needed for word in get_words_from_pattern(p): opfts.append(['+', word, f, 'w']) # '+' in all units elif m == 'o': # A5 - any of the words: p = strip_accents(p) # strip accents for 'w' mode, FIXME: delete when not needed for word in get_words_from_pattern(p): if len(opfts)==0: opfts.append(['+', word, f, 'w']) # '+' in the first unit else: opfts.append(['|', word, f, 'w']) # '|' in further units else: if of.startswith("h"): print_warning(req, "Matching type '%s' is not implemented yet." % cgi.escape(m), "Warning") opfts.append(['+', "%" + p + "%", f, 'w']) else: ## B - matching type is not known: let us try to determine it by some heuristics if f and p[0] == '"' and p[-1] == '"': ## B0 - does 'p' start and end by double quote, and is 'f' defined? => doing ACC search opfts.append(['+', p[1:-1], f, 'a']) elif f and p[0] == "'" and p[-1] == "'": ## B0bis - does 'p' start and end by single quote, and is 'f' defined? => doing ACC search opfts.append(['+', '%' + p[1:-1] + '%', f, 'a']) elif f and p[0] == "/" and p[-1] == "/": ## B0ter - does 'p' start and end by a slash, and is 'f' defined? => doing regexp search opfts.append(['+', p[1:-1], f, 'r']) elif f and string.find(p, ',') >= 0: ## B1 - does 'p' contain comma, and is 'f' defined? => doing ACC search opfts.append(['+', p, f, 'a']) elif f and str(f[0:2]).isdigit(): ## B2 - does 'f' exist and starts by two digits? => doing ACC search opfts.append(['+', p, f, 'a']) else: ## B3 - doing WRD search, but maybe ACC too # search units are separated by spaces unless the space is within single or double quotes # so, let us replace temporarily any space within quotes by '__SPACE__' p = re_pattern_single_quotes.sub(lambda x: "'"+string.replace(x.group(1), ' ', '__SPACE__')+"'", p) p = re_pattern_double_quotes.sub(lambda x: "\""+string.replace(x.group(1), ' ', '__SPACE__')+"\"", p) p = re_pattern_regexp_quotes.sub(lambda x: "/"+string.replace(x.group(1), ' ', '__SPACE__')+"/", p) # wash argument: p = re_equal.sub(":", p) p = re_logical_and.sub(" ", p) p = re_logical_or.sub(" |", p) p = re_logical_not.sub(" -", p) p = re_operators.sub(r' \1', p) for pi in string.split(p): # iterate through separated units (or items, as "pi" stands for "p item") pi = re_pattern_space.sub(" ", pi) # replace back '__SPACE__' by ' ' # firstly, determine set operator if pi[0] == '+' or pi[0] == '-' or pi[0] == '|': oi = pi[0] pi = pi[1:] else: # okay, there is no operator, so let us decide what to do by default oi = '+' # by default we are doing set intersection... # secondly, determine search pattern and field: if string.find(pi, ":") > 0: fi, pi = string.split(pi, ":", 1) # test whether fi is a real index code or a MARC-tag defined code: if fi in get_fieldcodes() or '00' <= fi[:2] <= '99': pass else: # it is not, so join it back: fi, pi = f, fi + ":" + pi else: fi, pi = f, pi # look also for old ALEPH field names: if fi and CFG_WEBSEARCH_FIELDS_CONVERT.has_key(string.lower(fi)): fi = CFG_WEBSEARCH_FIELDS_CONVERT[string.lower(fi)] # wash 'pi' argument: if re_quotes.match(pi): # B3a - quotes are found => do ACC search (phrase search) if pi[0] == '"' and pi[-1] == '"': pi = string.replace(pi, '"', '') # remove quote signs opfts.append([oi, pi, fi, 'a']) elif pi[0] == "'" and pi[-1] == "'": pi = string.replace(pi, "'", "") # remove quote signs opfts.append([oi, "%" + pi + "%", fi, 'a']) else: # unbalanced quotes, so fall back to WRD query: opfts.append([oi, pi, fi, 'w']) elif fi and str(fi[0]).isdigit() and str(fi[0]).isdigit(): # B3b - fi exists and starts by two digits => do ACC search opfts.append([oi, pi, fi, 'a']) elif fi and not get_index_id_from_field(fi) and get_field_name(fi): # B3c - logical field fi exists but there is no WRD index for fi => try ACC search opfts.append([oi, pi, fi, 'a']) elif pi.startswith('/') and pi.endswith('/'): # B3d - pi has slashes around => do regexp search opfts.append([oi, pi[1:-1], fi, 'r']) else: # B3e - general case => do WRD search pi = strip_accents(pi) # strip accents for 'w' mode, FIXME: delete when not needed for pii in get_words_from_pattern(pi): opfts.append([oi, pii, fi, 'w']) ## sanity check: for i in range(0, len(opfts)): try: pi = opfts[i][1] if pi == '*': if of.startswith("h"): print_warning(req, "Ignoring standalone wildcard word.", "Warning") del opfts[i] if pi == '' or pi == ' ': fi = opfts[i][2] if fi: if of.startswith("h"): print_warning(req, "Ignoring empty <em>%s</em> search term." % fi, "Warning") del opfts[i] except: pass ## return search units: return opfts -def page_start(req, of, cc, as, ln, uid, title_message=None, +def page_start(req, of, cc, aas, ln, uid, title_message=None, description='', keywords='', recID=-1, tab='', p=''): "Start page according to given output format." _ = gettext_set_language(ln) if not req: return # we were called from CLI if not title_message: title_message = _("Search Results") content_type = get_output_format_content_type(of) if of.startswith('x'): if of == 'xr': # we are doing RSS output req.content_type = "application/rss+xml" req.send_http_header() req.write("""<?xml version="1.0" encoding="UTF-8"?>\n""") else: # we are doing XML output: req.content_type = "text/xml" req.send_http_header() req.write("""<?xml version="1.0" encoding="UTF-8"?>\n""") elif of.startswith('t') or str(of[0:3]).isdigit(): # we are doing plain text output: req.content_type = "text/plain" req.send_http_header() elif of == "id": pass # nothing to do, we shall only return list of recIDs elif content_type == 'text/html': # we are doing HTML output: req.content_type = "text/html" req.send_http_header() if not description: description = "%s %s." % (cc, _("Search Results")) if not keywords: keywords = "%s, WebSearch, %s" % (get_coll_i18nname(CFG_SITE_NAME, ln, False), get_coll_i18nname(cc, ln, False)) ## generate RSS URL: argd = {} if req.args: argd = cgi.parse_qs(req.args) rssurl = websearch_templates.build_rss_url(argd) ## add jsmath if displaying single records (FIXME: find ## eventual better place to this code) if of.lower() in CFG_WEBSEARCH_USE_JSMATH_FOR_FORMATS: metaheaderadd = """ <script type='text/javascript'> jsMath = { Controls: {cookie: {printwarn: 0}} }; </script> <script src='/jsMath/easy/invenio-jsmath.js' type='text/javascript'></script> """ else: metaheaderadd = '' ## generate navtrail: - navtrail = create_navtrail_links(cc, as, ln) + navtrail = create_navtrail_links(cc, aas, ln) if navtrail != '': navtrail += ' > ' if (tab != '' or ((of != '' or of.lower() != 'hd') and of != 'hb')) and \ recID != -1: # If we are not in information tab in HD format, customize # the nav. trail to have a link back to main record. (Due # to the way perform_request_search() works, hb # (lowercase) is equal to hd) navtrail += ' <a class="navtrail" href="%s/record/%s">%s</a>' % \ (CFG_SITE_URL, recID, title_message) if (of != '' or of.lower() != 'hd') and of != 'hb': # Export format_name = of query = "SELECT name FROM format WHERE code=%s" res = run_sql(query, (of,)) if res: format_name = res[0][0] navtrail += ' > ' + format_name else: # Discussion, citations, etc. tabs tab_label = get_detailed_page_tabs(cc, ln=ln)[tab]['label'] navtrail += ' > ' + _(tab_label) else: navtrail += title_message if p: # we are serving search/browse results pages, so insert pattern: navtrail += ": " + cgi.escape(p) title_message = cgi.escape(p) + " - " + title_message ## finally, print page header: req.write(pageheaderonly(req=req, title=title_message, navtrail=navtrail, description=description, keywords=keywords, metaheaderadd=metaheaderadd, uid=uid, language=ln, navmenuid='search', navtrail_append_title_p=0, rssurl=rssurl)) req.write(websearch_templates.tmpl_search_pagestart(ln=ln)) #else: # req.send_http_header() def page_end(req, of="hb", ln=CFG_SITE_LANG): "End page according to given output format: e.g. close XML tags, add HTML footer, etc." if of == "id": return [] # empty recID list if not req: return # we were called from CLI if of.startswith('h'): req.write(websearch_templates.tmpl_search_pageend(ln = ln)) # pagebody end req.write(pagefooteronly(lastupdated=__lastupdated__, language=ln, req=req)) return "\n" def create_page_title_search_pattern_info(p, p1, p2, p3): """Create the search pattern bit for the page <title> web page HTML header. Basically combine p and (p1,p2,p3) together so that the page header may be filled whether we are in the Simple Search or Advanced Search interface contexts.""" out = "" if p: out = p else: out = p1 if p2: out += ' ' + p2 if p3: out += ' ' + p3 return out def create_inputdate_box(name="d1", selected_year=0, selected_month=0, selected_day=0, ln=CFG_SITE_LANG): "Produces 'From Date', 'Until Date' kind of selection box. Suitable for search options." _ = gettext_set_language(ln) box = "" # day box += """<select name="%sd">""" % name box += """<option value="">%s""" % _("any day") for day in range(1, 32): box += """<option value="%02d"%s>%02d""" % (day, is_selected(day, selected_day), day) box += """</select>""" # month box += """<select name="%sm">""" % name box += """<option value="">%s""" % _("any month") for mm, month in [(1, _("January")), (2, _("February")), (3, _("March")), (4, _("April")), \ (5, _("May")), (6, _("June")), (7, _("July")), (8, _("August")), \ (9, _("September")), (10, _("October")), (11, _("November")), (12, _("December"))]: box += """<option value="%02d"%s>%s""" % (mm, is_selected(mm, selected_month), month) box += """</select>""" # year box += """<select name="%sy">""" % name box += """<option value="">%s""" % _("any year") this_year = int(time.strftime("%Y", time.localtime())) for year in range(this_year-20, this_year+1): box += """<option value="%d"%s>%d""" % (year, is_selected(year, selected_year), year) box += """</select>""" return box -def create_search_box(cc, colls, p, f, rg, sf, so, sp, rm, of, ot, as, +def create_search_box(cc, colls, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action=""): """Create search box for 'search again in the results page' functionality.""" # load the right message language _ = gettext_set_language(ln) # some computations cc_intl = get_coll_i18nname(cc, ln, False) cc_colID = get_colID(cc) colls_nicely_ordered = [] if cfg_nicely_ordered_collection_list: colls_nicely_ordered = get_nicely_ordered_collection_list(ln=ln) else: colls_nicely_ordered = get_alphabetically_ordered_collection_list(ln=ln) colls_nice = [] for (cx, cx_printable) in colls_nicely_ordered: if not cx.startswith("Unnamed collection"): colls_nice.append({ 'value' : cx, 'text' : cx_printable }) coll_selects = [] if colls and colls[0] != CFG_SITE_NAME: # some collections are defined, so print these first, and only then print 'add another collection' heading: for c in colls: if c: temp = [] temp.append({ 'value' : CFG_SITE_NAME, 'text' : '*** %s ***' % _("any public collection") }) for val in colls_nice: # print collection: if not cx.startswith("Unnamed collection"): temp.append({ 'value' : val['value'], 'text' : val['text'], 'selected' : (c == re.sub("^[\s\-]*","", val['value'])) }) coll_selects.append(temp) coll_selects.append([{ 'value' : '', 'text' : '*** %s ***' % _("add another collection") }] + colls_nice) else: # we searched in CFG_SITE_NAME, so print 'any public collection' heading coll_selects.append([{ 'value' : CFG_SITE_NAME, 'text' : '*** %s ***' % _("any public collection") }] + colls_nice) ## ranking methods ranks = [{ 'value' : '', 'text' : "- %s %s -" % (_("OR").lower (), _("rank by")), }] for (code, name) in get_bibrank_methods(cc_colID, ln): # propose found rank methods: ranks.append({ 'value' : code, 'text' : name, }) formats = [] query = """SELECT code,name FROM format WHERE visibility='1' ORDER BY name ASC""" res = run_sql(query) if res: # propose found formats: for code, name in res: formats.append({ 'value' : code, 'text' : name }) else: formats.append({'value' : 'hb', 'text' : _("HTML brief") }) # show collections in the search box? (not if there is only one # collection defined, and not if we are in light search) show_colls = True if len(collection_reclist_cache.cache.keys()) == 1 or \ - as == -1: + aas == -1: show_colls = False return websearch_templates.tmpl_search_box( ln = ln, - as = as, + aas = aas, cc_intl = cc_intl, cc = cc, ot = ot, sp = sp, action = action, fieldslist = get_searchwithin_fields(ln=ln, colID=cc_colID), f1 = f1, f2 = f2, f3 = f3, m1 = m1, m2 = m2, m3 = m3, p1 = p1, p2 = p2, p3 = p3, op1 = op1, op2 = op2, rm = rm, p = p, f = f, coll_selects = coll_selects, d1y = d1y, d2y = d2y, d1m = d1m, d2m = d2m, d1d = d1d, d2d = d2d, dt = dt, sort_fields = get_sortby_fields(ln=ln, colID=cc_colID), sf = sf, so = so, ranks = ranks, sc = sc, rg = rg, formats = formats, of = of, pl = pl, jrec = jrec, ec = ec, show_colls = show_colls, ) -def create_navtrail_links(cc=CFG_SITE_NAME, as=0, ln=CFG_SITE_LANG, self_p=1, tab=''): +def create_navtrail_links(cc=CFG_SITE_NAME, aas=0, ln=CFG_SITE_LANG, self_p=1, tab=''): """Creates navigation trail links, i.e. links to collection - ancestors (except Home collection). If as==1, then links to + ancestors (except Home collection). If aas==1, then links to Advanced Search interfaces; otherwise Simple Search. """ dads = [] for dad in get_coll_ancestors(cc): if dad != CFG_SITE_NAME: # exclude Home collection dads.append ((dad, get_coll_i18nname(dad, ln, False))) if self_p and cc != CFG_SITE_NAME: dads.append((cc, get_coll_i18nname(cc, ln, False))) return websearch_templates.tmpl_navtrail_links( - as=as, ln=ln, dads=dads) + aas=aas, ln=ln, dads=dads) def get_searchwithin_fields(ln='en', colID=None): """Retrieves the fields name used in the 'search within' selection box for the collection ID colID.""" res = None if colID: res = run_sql_cached("""SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='sew' AND cff.id_collection=%s AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""", (colID,), affected_tables=['field', 'collection_field_fieldvalue']) if not res: res = run_sql_cached("SELECT code,name FROM field ORDER BY name ASC", affected_tables=['field',]) fields = [{ 'value' : '', 'text' : get_field_i18nname("any field", ln, False) }] for field_code, field_name in res: if field_code and field_code != "anyfield": fields.append({ 'value' : field_code, 'text' : get_field_i18nname(field_name, ln, False) }) return fields def get_sortby_fields(ln='en', colID=None): """Retrieves the fields name used in the 'sort by' selection box for the collection ID colID.""" _ = gettext_set_language(ln) res = None if colID: res = run_sql_cached("""SELECT DISTINCT(f.code),f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='soo' AND cff.id_collection=%s AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""", (colID,), affected_tables=['field', 'collection_field_fieldvalue']) if not res: # no sort fields defined for this colID, try to take Home collection: res = run_sql_cached("""SELECT DISTINCT(f.code),f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='soo' AND cff.id_collection=%s AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""", (1,), affected_tables=['field', 'collection_field_fieldvalue']) if not res: # no sort fields defined for the Home collection, take all sort fields defined wherever they are: res = run_sql_cached("""SELECT DISTINCT(f.code),f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='soo' AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""", affected_tables=['field', 'collection_field_fieldvalue']) fields = [{ 'value' : '', 'text' : _("latest first") }] for field_code, field_name in res: if field_code and field_code != "anyfield": fields.append({ 'value' : field_code, 'text' : get_field_i18nname(field_name, ln, False) }) return fields def create_andornot_box(name='op', value='', ln='en'): "Returns HTML code for the AND/OR/NOT selection box." _ = gettext_set_language(ln) out = """ <select name="%s"> <option value="a"%s>%s <option value="o"%s>%s <option value="n"%s>%s </select> """ % (name, is_selected('a', value), _("AND"), is_selected('o', value), _("OR"), is_selected('n', value), _("AND NOT")) return out def create_matchtype_box(name='m', value='', ln='en'): "Returns HTML code for the 'match type' selection box." _ = gettext_set_language(ln) out = """ <select name="%s"> <option value="a"%s>%s <option value="o"%s>%s <option value="e"%s>%s <option value="p"%s>%s <option value="r"%s>%s </select> """ % (name, is_selected('a', value), _("All of the words:"), is_selected('o', value), _("Any of the words:"), is_selected('e', value), _("Exact phrase:"), is_selected('p', value), _("Partial phrase:"), is_selected('r', value), _("Regular expression:")) return out def is_selected(var, fld): "Checks if the two are equal, and if yes, returns ' selected'. Useful for select boxes." if type(var) is int and type(fld) is int: if var == fld: return " selected" elif str(var) == str(fld): return " selected" elif fld and len(fld)==3 and fld[0] == "w" and var == fld[1:]: return " selected" return "" def wash_colls(cc, c, split_colls=0): """Wash collection list by checking whether user has deselected anything under 'Narrow search'. Checks also if cc is a list or not. Return list of cc, colls_to_display, colls_to_search since the list of collections to display is different from that to search in. This is because users might have chosen 'split by collection' functionality. The behaviour of "collections to display" depends solely whether user has deselected a particular collection: e.g. if it started from 'Articles and Preprints' page, and deselected 'Preprints', then collection to display is 'Articles'. If he did not deselect anything, then collection to display is 'Articles & Preprints'. The behaviour of "collections to search in" depends on the 'split_colls' parameter: * if is equal to 1, then we can wash the colls list down and search solely in the collection the user started from; * if is equal to 0, then we are splitting to the first level of collections, i.e. collections as they appear on the page we started to search from; The function raises exception InvenioWebSearchUnknownCollectionError if cc or one of c collections is not known. """ colls_out = [] colls_out_for_display = [] # check what type is 'cc': if type(cc) is list: for ci in cc: if collection_reclist_cache.cache.has_key(ci): # yes this collection is real, so use it: cc = ci break else: # check once if cc is real: if not collection_reclist_cache.cache.has_key(cc): if cc: raise InvenioWebSearchUnknownCollectionError(cc) else: cc = CFG_SITE_NAME # cc is not set, so replace it with Home collection # check type of 'c' argument: if type(c) is list: colls = c else: colls = [c] # remove all 'unreal' collections: colls_real = [] for coll in colls: if collection_reclist_cache.cache.has_key(coll): colls_real.append(coll) else: if coll: raise InvenioWebSearchUnknownCollectionError(coll) colls = colls_real # check if some real collections remain: if len(colls)==0: colls = [cc] # then let us check the list of non-restricted "real" sons of 'cc' and compare it to 'coll': res = run_sql("""SELECT c.name FROM collection AS c, collection_collection AS cc, collection AS ccc WHERE c.id=cc.id_son AND cc.id_dad=ccc.id AND ccc.name=%s AND cc.type='r'""", (cc,)) l_cc_nonrestricted_sons = [] l_c = colls for row in res: if not collection_restricted_p(row[0]): l_cc_nonrestricted_sons.append(row[0]) l_c.sort() l_cc_nonrestricted_sons.sort() if l_cc_nonrestricted_sons == l_c: colls_out_for_display = [cc] # yep, washing permitted, it is sufficient to display 'cc' else: colls_out_for_display = colls # nope, we need to display all 'colls' successively # remove duplicates: colls_out_for_display_nondups=filter(lambda x, colls_out_for_display=colls_out_for_display: colls_out_for_display[x-1] not in colls_out_for_display[x:], range(1, len(colls_out_for_display)+1)) colls_out_for_display = map(lambda x, colls_out_for_display=colls_out_for_display:colls_out_for_display[x-1], colls_out_for_display_nondups) # second, let us decide on collection splitting: if split_colls == 0: # type A - no sons are wanted colls_out = colls_out_for_display # elif split_colls == 1: else: # type B - sons (first-level descendants) are wanted for coll in colls_out_for_display: coll_sons = get_coll_sons(coll) if coll_sons == []: colls_out.append(coll) else: colls_out = colls_out + coll_sons # remove duplicates: colls_out_nondups=filter(lambda x, colls_out=colls_out: colls_out[x-1] not in colls_out[x:], range(1, len(colls_out)+1)) colls_out = map(lambda x, colls_out=colls_out:colls_out[x-1], colls_out_nondups) return (cc, colls_out_for_display, colls_out) def strip_accents(x): """Strip accents in the input phrase X (assumed in UTF-8) by replacing accented characters with their unaccented cousins (e.g. é by e). Return such a stripped X.""" x = re_latex_lowercase_a.sub("a", x) x = re_latex_lowercase_ae.sub("ae", x) x = re_latex_lowercase_e.sub("e", x) x = re_latex_lowercase_i.sub("i", x) x = re_latex_lowercase_o.sub("o", x) x = re_latex_lowercase_u.sub("u", x) x = re_latex_lowercase_y.sub("x", x) x = re_latex_lowercase_c.sub("c", x) x = re_latex_lowercase_n.sub("n", x) x = re_latex_uppercase_a.sub("A", x) x = re_latex_uppercase_ae.sub("AE", x) x = re_latex_uppercase_e.sub("E", x) x = re_latex_uppercase_i.sub("I", x) x = re_latex_uppercase_o.sub("O", x) x = re_latex_uppercase_u.sub("U", x) x = re_latex_uppercase_y.sub("Y", x) x = re_latex_uppercase_c.sub("C", x) x = re_latex_uppercase_n.sub("N", x) # convert input into Unicode string: try: y = unicode(x, "utf-8") except: return x # something went wrong, probably the input wasn't UTF-8 # asciify Latin-1 lowercase characters: y = re_unicode_lowercase_a.sub("a", y) y = re_unicode_lowercase_ae.sub("ae", y) y = re_unicode_lowercase_e.sub("e", y) y = re_unicode_lowercase_i.sub("i", y) y = re_unicode_lowercase_o.sub("o", y) y = re_unicode_lowercase_u.sub("u", y) y = re_unicode_lowercase_y.sub("y", y) y = re_unicode_lowercase_c.sub("c", y) y = re_unicode_lowercase_n.sub("n", y) # asciify Latin-1 uppercase characters: y = re_unicode_uppercase_a.sub("A", y) y = re_unicode_uppercase_ae.sub("AE", y) y = re_unicode_uppercase_e.sub("E", y) y = re_unicode_uppercase_i.sub("I", y) y = re_unicode_uppercase_o.sub("O", y) y = re_unicode_uppercase_u.sub("U", y) y = re_unicode_uppercase_y.sub("Y", y) y = re_unicode_uppercase_c.sub("C", y) y = re_unicode_uppercase_n.sub("N", y) # return UTF-8 representation of the Unicode string: return y.encode("utf-8") def wash_index_term(term, max_char_length=50, lower_term=True): """ Return washed form of the index term TERM that would be suitable for storing into idxWORD* tables. I.e., lower the TERM if LOWER_TERM is True, and truncate it safely to MAX_CHAR_LENGTH UTF-8 characters (meaning, in principle, 4*MAX_CHAR_LENGTH bytes). The function works by an internal conversion of TERM, when needed, from its input Python UTF-8 binary string format into Python Unicode format, and then truncating it safely to the given number of UTF-8 characters, without possible mis-truncation in the middle of a multi-byte UTF-8 character that could otherwise happen if we would have been working with UTF-8 binary representation directly. Note that MAX_CHAR_LENGTH corresponds to the length of the term column in idxINDEX* tables. """ if lower_term: washed_term = unicode(term, 'utf-8').lower() else: washed_term = unicode(term, 'utf-8') if len(washed_term) <= max_char_length: # no need to truncate the term, because it will fit # nicely even if it uses four-byte UTF-8 characters return washed_term.encode('utf-8') else: # truncate the term in a safe position: return washed_term[:max_char_length].encode('utf-8') def lower_index_term(term): """ Return safely lowered index term TERM. This is done by converting to UTF-8 first, because standard Python lower() function is not UTF-8 safe. To be called by both the search engine and the indexer when appropriate (e.g. before stemming). In case of problems with UTF-8 compliance, this function raises UnicodeDecodeError, so the client code may want to catch it. """ return unicode(term, 'utf-8').lower().encode('utf-8') def wash_output_format(format): """Wash output format FORMAT. Currently only prevents input like 'of=9' for backwards-compatible format that prints certain fields only. (for this task, 'of=tm' is preferred)""" if str(format[0:3]).isdigit() and len(format) != 6: # asked to print MARC tags, but not enough digits, # so let's switch back to HTML brief default return 'hb' else: return format def wash_pattern(p): """Wash pattern passed by URL. Check for sanity of the wildcard by removing wildcards if they are appended to extremely short words (1-3 letters). TODO: instead of this approximative treatment, it will be much better to introduce a temporal limit, e.g. to kill a query if it does not finish in 10 seconds.""" # strip accents: # p = strip_accents(p) # FIXME: when available, strip accents all the time # add leading/trailing whitespace for the two following wildcard-sanity checking regexps: p = " " + p + " " # get rid of wildcards at the beginning of words: p = re_pattern_wildcards_at_beginning.sub("\\1", p) # replace spaces within quotes by __SPACE__ temporarily: p = re_pattern_single_quotes.sub(lambda x: "'"+string.replace(x.group(1), ' ', '__SPACE__')+"'", p) p = re_pattern_double_quotes.sub(lambda x: "\""+string.replace(x.group(1), ' ', '__SPACE__')+"\"", p) p = re_pattern_regexp_quotes.sub(lambda x: "/"+string.replace(x.group(1), ' ', '__SPACE__')+"/", p) # get rid of extremely short words (1-3 letters with wildcards): p = re_pattern_short_words.sub("\\1", p) # replace back __SPACE__ by spaces: p = re_pattern_space.sub(" ", p) # replace special terms: p = re_pattern_today.sub(time.strftime("%Y-%m-%d", time.localtime()), p) # remove unnecessary whitespace: p = string.strip(p) return p def wash_field(f): """Wash field passed by URL.""" # get rid of unnecessary whitespace: f = string.strip(f) # wash old-style CDS Invenio/ALEPH 'f' field argument, e.g. replaces 'wau' and 'au' by 'author' if CFG_WEBSEARCH_FIELDS_CONVERT.has_key(string.lower(f)): f = CFG_WEBSEARCH_FIELDS_CONVERT[f] return f def wash_dates(d1="", d1y=0, d1m=0, d1d=0, d2="", d2y=0, d2m=0, d2d=0): """ Take user-submitted date arguments D1 (full datetime string) or (D1Y, D1M, D1Y) year, month, day tuple and D2 or (D2Y, D2M, D2Y) and return (YYY1-M1-D2 H1:M1:S2, YYY2-M2-D2 H2:M2:S2) datetime strings in the YYYY-MM-DD HH:MM:SS format suitable for time restricted searching. Note that when both D1 and (D1Y, D1M, D1D) parameters are present, the precedence goes to D1. Ditto for D2*. Note that when (D1Y, D1M, D1D) are taken into account, some values may be missing and are completed e.g. to 01 or 12 according to whether it is the starting or the ending date. """ datetext1, datetext2 = "", "" # sanity checking: if d1 == "" and d1y == 0 and d1m == 0 and d1d == 0 and d2 == "" and d2y == 0 and d2m == 0 and d2d == 0: return ("", "") # nothing selected, so return empty values # wash first (starting) date: if d1: # full datetime string takes precedence: datetext1 = d1 else: # okay, first date passed as (year,month,day): if d1y: datetext1 += "%04d" % d1y else: datetext1 += "0000" if d1m: datetext1 += "-%02d" % d1m else: datetext1 += "-01" if d1d: datetext1 += "-%02d" % d1d else: datetext1 += "-01" datetext1 += " 00:00:00" # wash second (ending) date: if d2: # full datetime string takes precedence: datetext2 = d2 else: # okay, second date passed as (year,month,day): if d2y: datetext2 += "%04d" % d2y else: datetext2 += "9999" if d2m: datetext2 += "-%02d" % d2m else: datetext2 += "-12" if d2d: datetext2 += "-%02d" % d2d else: datetext2 += "-31" # NOTE: perhaps we should add max(datenumber) in # given month, but for our quering it's not # needed, 31 will always do datetext2 += " 00:00:00" # okay, return constructed YYYY-MM-DD HH:MM:SS datetexts: return (datetext1, datetext2) def get_colID(c): "Return collection ID for collection name C. Return None if no match found." colID = None res = run_sql("SELECT id FROM collection WHERE name=%s", (c,), 1) if res: colID = res[0][0] return colID def get_coll_ancestors(coll): "Returns a list of ancestors for collection 'coll'." coll_ancestors = [] coll_ancestor = coll while 1: res = run_sql("""SELECT c.name FROM collection AS c LEFT JOIN collection_collection AS cc ON c.id=cc.id_dad LEFT JOIN collection AS ccc ON ccc.id=cc.id_son WHERE ccc.name=%s ORDER BY cc.id_dad ASC LIMIT 1""", (coll_ancestor,)) if res: coll_name = res[0][0] coll_ancestors.append(coll_name) coll_ancestor = coll_name else: break # ancestors found, return reversed list: coll_ancestors.reverse() return coll_ancestors def get_coll_sons(coll, type='r', public_only=1): """Return a list of sons (first-level descendants) of type 'type' for collection 'coll'. If public_only, then return only non-restricted son collections. """ coll_sons = [] query = "SELECT c.name FROM collection AS c "\ "LEFT JOIN collection_collection AS cc ON c.id=cc.id_son "\ "LEFT JOIN collection AS ccc ON ccc.id=cc.id_dad "\ "WHERE cc.type=%s AND ccc.name=%s" query += " ORDER BY cc.score DESC" res = run_sql(query, (type, coll)) for name in res: if not public_only or not collection_restricted_p(name[0]): coll_sons.append(name[0]) return coll_sons def get_coll_real_descendants(coll): """Return a list of all descendants of collection 'coll' that are defined by a 'dbquery'. IOW, we need to decompose compound collections like "A & B" into "A" and "B" provided that "A & B" has no associated database query defined. """ coll_sons = [] res = run_sql("""SELECT c.name,c.dbquery FROM collection AS c LEFT JOIN collection_collection AS cc ON c.id=cc.id_son LEFT JOIN collection AS ccc ON ccc.id=cc.id_dad WHERE ccc.name=%s ORDER BY cc.score DESC""", (coll,)) for name, dbquery in res: if dbquery: # this is 'real' collection, so return it: coll_sons.append(name) else: # this is 'composed' collection, so recurse: coll_sons.extend(get_coll_real_descendants(name)) return coll_sons def browse_pattern(req, colls, p, f, rg, ln=CFG_SITE_LANG): """Browse either biliographic phrases or words indexes, and display it.""" # load the right message language _ = gettext_set_language(ln) ## is p enclosed in quotes? (coming from exact search) if p.startswith('"') and p.endswith('"'): p = p[1:-1] p_orig = p ## okay, "real browse" follows: ## FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test if not f and string.find(p, ":") > 0: # does 'p' contain ':'? f, p = string.split(p, ":", 1) ## do we search in words indexes? if not f: return browse_in_bibwords(req, p, f) index_id = get_index_id_from_field(f) if index_id != 0: coll = HitSet() for coll_name in colls: coll |= get_collection_reclist(coll_name) browsed_phrases_in_colls = get_nearest_terms_in_idxphrase_with_collection(p, index_id, rg/2, rg/2, coll) else: browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg+1)/2+1, (rg-1)/2+1) while not browsed_phrases: # try again and again with shorter and shorter pattern: try: p = p[:-1] browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg+1)/2+1, (rg-1)/2+1) except: # probably there are no hits at all: req.write(_("No values found.")) return ## try to check hits in these particular collection selection: browsed_phrases_in_colls = [] if 0: for phrase in browsed_phrases: phrase_hitset = HitSet() phrase_hitsets = search_pattern("", phrase, f, 'e') for coll in colls: phrase_hitset.union_update(phrase_hitsets[coll]) if len(phrase_hitset) > 0: # okay, this phrase has some hits in colls, so add it: browsed_phrases_in_colls.append([phrase, len(phrase_hitset)]) ## were there hits in collections? if browsed_phrases_in_colls == []: if browsed_phrases != []: #print_warning(req, """<p>No match close to <em>%s</em> found in given collections. #Please try different term.<p>Displaying matches in any collection...""" % p_orig) ## try to get nbhits for these phrases in any collection: for phrase in browsed_phrases: browsed_phrases_in_colls.append([phrase, get_nbhits_in_bibxxx(phrase, f)]) ## display results now: out = websearch_templates.tmpl_browse_pattern( f=f, fn=get_field_i18nname(get_field_name(f) or f, ln, False), ln=ln, browsed_phrases_in_colls=browsed_phrases_in_colls, colls=colls, rg=rg, ) req.write(out) return def browse_in_bibwords(req, p, f, ln=CFG_SITE_LANG): """Browse inside words indexes.""" if not p: return _ = gettext_set_language(ln) urlargd = {} urlargd.update(req.argd) urlargd['action'] = 'search' nearest_box = create_nearest_terms_box(urlargd, p, f, 'w', ln=ln, intro_text_p=0) req.write(websearch_templates.tmpl_search_in_bibwords( p = p, f = f, ln = ln, nearest_box = nearest_box )) return def search_pattern(req=None, p=None, f=None, m=None, ap=0, of="id", verbose=0, ln=CFG_SITE_LANG): """Search for complex pattern 'p' within field 'f' according to matching type 'm'. Return hitset of recIDs. The function uses multi-stage searching algorithm in case of no exact match found. See the Search Internals document for detailed description. The 'ap' argument governs whether an alternative patterns are to be used in case there is no direct hit for (p,f,m). For example, whether to replace non-alphanumeric characters by spaces if it would give some hits. See the Search Internals document for detailed description. (ap=0 forbits the alternative pattern usage, ap=1 permits it.) The 'of' argument governs whether to print or not some information to the user in case of no match found. (Usually it prints the information in case of HTML formats, otherwise it's silent). The 'verbose' argument controls the level of debugging information to be printed (0=least, 9=most). All the parameters are assumed to have been previously washed. This function is suitable as a mid-level API. """ _ = gettext_set_language(ln) hitset_empty = HitSet() # sanity check: if not p: hitset_full = HitSet(trailing_bits=1) hitset_full.discard(0) # no pattern, so return all universe return hitset_full # search stage 1: break up arguments into basic search units: if verbose and of.startswith("h"): t1 = os.times()[4] basic_search_units = create_basic_search_units(req, p, f, m, of) if verbose and of.startswith("h"): t2 = os.times()[4] print_warning(req, "Search stage 1: basic search units are: %s" % cgi.escape(repr(basic_search_units))) print_warning(req, "Search stage 1: execution took %.2f seconds." % (t2 - t1)) # search stage 2: do search for each search unit and verify hit presence: if verbose and of.startswith("h"): t1 = os.times()[4] basic_search_units_hitsets = [] for idx_unit in xrange(len(basic_search_units)): bsu_o, bsu_p, bsu_f, bsu_m = basic_search_units[idx_unit] basic_search_unit_hitset = search_unit(bsu_p, bsu_f, bsu_m) if verbose >= 9 and of.startswith("h"): print_warning(req, "Search stage 1: pattern %s gave hitlist %s" % (cgi.escape(bsu_p), basic_search_unit_hitset)) if len(basic_search_unit_hitset) > 0 or \ ap==0 or \ bsu_o=="|" or \ ((idx_unit+1)<len(basic_search_units) and basic_search_units[idx_unit+1][0]=="|"): # stage 2-1: this basic search unit is retained, since # either the hitset is non-empty, or the approximate # pattern treatment is switched off, or the search unit # was joined by an OR operator to preceding/following # units so we do not require that it exists basic_search_units_hitsets.append(basic_search_unit_hitset) else: # stage 2-2: no hits found for this search unit, try to replace non-alphanumeric chars inside pattern: if re.search(r'[^a-zA-Z0-9\s\:]', bsu_p): if bsu_p.startswith('"') and bsu_p.endswith('"'): # is it ACC query? bsu_pn = re.sub(r'[^a-zA-Z0-9\s\:]+', "*", bsu_p) else: # it is WRD query bsu_pn = re.sub(r'[^a-zA-Z0-9\s\:]+', " ", bsu_p) if verbose and of.startswith('h') and req: print_warning(req, "Trying (%s,%s,%s)" % (cgi.escape(bsu_pn), cgi.escape(bsu_f), cgi.escape(bsu_m))) basic_search_unit_hitset = search_pattern(req=None, p=bsu_pn, f=bsu_f, m=bsu_m, of="id", ln=ln) if len(basic_search_unit_hitset) > 0: # we retain the new unit instead if of.startswith('h'): print_warning(req, _("No exact match found for %(x_query1)s, using %(x_query2)s instead...") % \ {'x_query1': "<em>" + cgi.escape(bsu_p) + "</em>", 'x_query2': "<em>" + cgi.escape(bsu_pn) + "</em>"}) basic_search_units[idx_unit][1] = bsu_pn basic_search_units_hitsets.append(basic_search_unit_hitset) else: # stage 2-3: no hits found either, propose nearest indexed terms: if of.startswith('h'): if req: if bsu_f == "recid": print_warning(req, "Requested record does not seem to exist.") else: print_warning(req, create_nearest_terms_box(req.argd, bsu_p, bsu_f, bsu_m, ln=ln)) return hitset_empty else: # stage 2-3: no hits found either, propose nearest indexed terms: if of.startswith('h'): if req: if bsu_f == "recid": print_warning(req, "Requested record does not seem to exist.") else: print_warning(req, create_nearest_terms_box(req.argd, bsu_p, bsu_f, bsu_m, ln=ln)) return hitset_empty if verbose and of.startswith("h"): t2 = os.times()[4] for idx_unit in range(0, len(basic_search_units)): print_warning(req, "Search stage 2: basic search unit %s gave %d hits." % (basic_search_units[idx_unit][1:], len(basic_search_units_hitsets[idx_unit]))) print_warning(req, "Search stage 2: execution took %.2f seconds." % (t2 - t1)) # search stage 3: apply boolean query for each search unit: if verbose and of.startswith("h"): t1 = os.times()[4] # let the initial set be the complete universe: hitset_in_any_collection = HitSet(trailing_bits=1) hitset_in_any_collection.discard(0) for idx_unit in xrange(len(basic_search_units)): this_unit_operation = basic_search_units[idx_unit][0] this_unit_hitset = basic_search_units_hitsets[idx_unit] if this_unit_operation == '+': hitset_in_any_collection.intersection_update(this_unit_hitset) elif this_unit_operation == '-': hitset_in_any_collection.difference_update(this_unit_hitset) elif this_unit_operation == '|': hitset_in_any_collection.union_update(this_unit_hitset) else: if of.startswith("h"): print_warning(req, "Invalid set operation %s." % cgi.escape(this_unit_operation), "Error") if len(hitset_in_any_collection) == 0: # no hits found, propose alternative boolean query: if of.startswith('h'): nearestterms = [] for idx_unit in range(0, len(basic_search_units)): bsu_o, bsu_p, bsu_f, bsu_m = basic_search_units[idx_unit] if bsu_p.startswith("%") and bsu_p.endswith("%"): bsu_p = "'" + bsu_p[1:-1] + "'" bsu_nbhits = len(basic_search_units_hitsets[idx_unit]) # create a similar query, but with the basic search unit only argd = {} argd.update(req.argd) argd['p'] = bsu_p argd['f'] = bsu_f nearestterms.append((bsu_p, bsu_nbhits, argd)) text = websearch_templates.tmpl_search_no_boolean_hits( ln=ln, nearestterms=nearestterms) print_warning(req, text) if verbose and of.startswith("h"): t2 = os.times()[4] print_warning(req, "Search stage 3: boolean query gave %d hits." % len(hitset_in_any_collection)) print_warning(req, "Search stage 3: execution took %.2f seconds." % (t2 - t1)) return hitset_in_any_collection def search_pattern_parenthesised(req=None, p=None, f=None, m=None, ap=0, of="id", verbose=0, ln=CFG_SITE_LANG): """Search for complex pattern 'p' containing parenthesis within field 'f' according to matching type 'm'. Return hitset of recIDs. For more details on the parameters see 'search_pattern' """ _ = gettext_set_language(ln) # if the pattern uses SPIRES search syntax, convert it to Invenio syntax spires_syntax_converter = SpiresToInvenioSyntaxConverter() p = spires_syntax_converter.convert_query(p) # sanity check: do not call parenthesised parser for search terms # like U(1): if not re_pattern_parens.search(p): return search_pattern(req, p, f, m, ap, of, verbose, ln) # Try searching with parentheses try: parser = SearchQueryParenthesisedParser() # get a hitset with all recids result_hitset = HitSet(trailing_bits=1) # parse the query. The result is list of [op1, expr1, op2, expr2, ..., opN, exprN] parsing_result = parser.parse_query(p) if verbose and of.startswith("h"): print_warning(req, "Search stage 1: search_pattern_parenthesised() returned %s." % repr(parsing_result)) # go through every pattern # calculate hitset for it # combine pattern's hitset with the result using the corresponding operator for index in xrange(0, len(parsing_result)-1, 2 ): current_operator = parsing_result[index] current_pattern = parsing_result[index+1] # obtain a hitset for the current patter current_hitset = search_pattern(req, current_pattern, f, m, ap, of, verbose, ln) # combine the current hitset with resulting hitset using the current operator if current_operator == '+': result_hitset = result_hitset & current_hitset elif current_operator == '-': result_hitset = result_hitset - current_hitset elif current_operator == '|': result_hitset = result_hitset | current_hitset else: assert False, "Unknown operator in search_pattern_parenthesised()" return result_hitset # If searching with parenteses fails, perform search ignoring parentheses except InvenioWebSearchQueryParserException: print_warning(req, _("Nested or mismatched parentheses detected. Ignoring all parentheses in the query...")) # remove the parentheses in the query. Current implementation removes all the parentheses, # but it could be improved to romove only these that are not insede quotes p = p.replace('(', ' ') p = p.replace(')', ' ') return search_pattern(req, p, f, m, ap, of, verbose, ln) def search_unit(p, f=None, m=None): """Search for basic search unit defined by pattern 'p' and field 'f' and matching type 'm'. Return hitset of recIDs. All the parameters are assumed to have been previously washed. 'p' is assumed to be already a ``basic search unit'' so that it is searched as such and is not broken up in any way. Only wildcard and span queries are being detected inside 'p'. This function is suitable as a low-level API. """ ## create empty output results set: set = HitSet() if not p: # sanity checking return set if m == 'a' or m == 'r': # we are doing either phrase search or regexp search index_id = get_index_id_from_field(f) if index_id != 0: set = search_unit_in_idxphrases(p, f, m) else: set = search_unit_in_bibxxx(p, f, m) elif p.startswith("cited:"): # we are doing search by the citation count set = search_unit_by_times_cited(p[6:]) else: # we are doing bibwords search by default set = search_unit_in_bibwords(p, f) return set def search_unit_in_bibwords(word, f, decompress=zlib.decompress): """Searches for 'word' inside bibwordsX table for field 'f' and returns hitset of recIDs.""" set = HitSet() # will hold output result set set_used = 0 # not-yet-used flag, to be able to circumvent set operations # deduce into which bibwordsX table we will search: stemming_language = get_index_stemming_language(get_index_id_from_field("anyfield")) bibwordsX = "idxWORD%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: bibwordsX = "idxWORD%02dF" % index_id stemming_language = get_index_stemming_language(index_id) else: return HitSet() # word index f does not exist # wash 'word' argument and run query: word = string.replace(word, '*', '%') # we now use '*' as the truncation character words = string.split(word, "->", 1) # check for span query if len(words) == 2: word0 = re_word.sub('', words[0]) word1 = re_word.sub('', words[1]) if stemming_language: word0 = lower_index_term(word0) word1 = lower_index_term(word1) word0 = stem(word0, stemming_language) word1 = stem(word1, stemming_language) res = run_sql("SELECT term,hitlist FROM %s WHERE term BETWEEN %%s AND %%s" % bibwordsX, (wash_index_term(word0), wash_index_term(word1))) else: if f == 'journal': pass # FIXME: quick hack for the journal index else: word = re_word.sub('', word) if stemming_language: word = lower_index_term(word) word = stem(word, stemming_language) if string.find(word, '%') >= 0: # do we have wildcard in the word? if f == 'journal': # FIXME: quick hack for the journal index # FIXME: we can run a sanity check here for all indexes res = () else: res = run_sql("SELECT term,hitlist FROM %s WHERE term LIKE %%s" % bibwordsX, (wash_index_term(word),)) else: res = run_sql("SELECT term,hitlist FROM %s WHERE term=%%s" % bibwordsX, (wash_index_term(word),)) # fill the result set: for word, hitlist in res: hitset_bibwrd = HitSet(hitlist) # add the results: if set_used: set.union_update(hitset_bibwrd) else: set = hitset_bibwrd set_used = 1 # okay, return result set: return set def search_unit_in_idxphrases(p, f, type): """Searches for phrase 'p' inside idxPHRASE*F table for field 'f' and returns hitset of recIDs found. The search type is defined by 'type' (e.g. equals to 'r' for a regexp search).""" set = HitSet() # will hold output result set set_used = 0 # not-yet-used flag, to be able to circumvent set operations # deduce in which idxPHRASE table we will search: idxphraseX = "idxPHRASE%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: idxphraseX = "idxPHRASE%02dF" % index_id else: return HitSet() # phrase index f does not exist # detect query type (exact phrase, partial phrase, regexp): if type == 'r': query_addons = "REGEXP %s" query_params = (p,) else: p = string.replace(p, '*', '%') # we now use '*' as the truncation character ps = string.split(p, "->", 1) # check for span query: if len(ps) == 2: query_addons = "BETWEEN %s AND %s" query_params = (ps[0], ps[1]) else: if string.find(p, '%') > -1: query_addons = "LIKE %s" query_params = (ps[0],) else: query_addons = "= %s" query_params = (ps[0],) # perform search: res = run_sql("SELECT term,hitlist FROM %s WHERE term %s" % (idxphraseX, query_addons), query_params) # fill the result set: for word, hitlist in res: hitset_bibphrase = HitSet(hitlist) # add the results: if set_used: set.union_update(hitset_bibphrase) else: set = hitset_bibphrase set_used = 1 # okay, return result set: return set def search_unit_in_bibxxx(p, f, type): """Searches for pattern 'p' inside bibxxx tables for field 'f' and returns hitset of recIDs found. The search type is defined by 'type' (e.g. equals to 'r' for a regexp search).""" # FIXME: quick hack for the journal index if f == 'journal': return search_unit_in_bibwords(p, f) p_orig = p # saving for eventual future 'no match' reporting query_addons = "" # will hold additional SQL code for the query query_params = () # will hold parameters for the query (their number may vary depending on TYPE argument) # wash arguments: f = string.replace(f, '*', '%') # replace truncation char '*' in field definition if type == 'r': query_addons = "REGEXP %s" query_params = (p,) else: p = string.replace(p, '*', '%') # we now use '*' as the truncation character ps = string.split(p, "->", 1) # check for span query: if len(ps) == 2: query_addons = "BETWEEN %s AND %s" query_params = (ps[0], ps[1]) else: if string.find(p, '%') > -1: query_addons = "LIKE %s" query_params = (ps[0],) else: query_addons = "= %s" query_params = (ps[0],) # construct 'tl' which defines the tag list (MARC tags) to search in: tl = [] if str(f[0]).isdigit() and str(f[1]).isdigit(): tl.append(f) # 'f' seems to be okay as it starts by two digits else: # convert old ALEPH tag names, if appropriate: (TODO: get rid of this before entering this function) if CFG_WEBSEARCH_FIELDS_CONVERT.has_key(string.lower(f)): f = CFG_WEBSEARCH_FIELDS_CONVERT[string.lower(f)] # deduce desired MARC tags on the basis of chosen 'f' tl = get_field_tags(f) if not tl: # f index does not exist, nevermind pass # okay, start search: l = [] # will hold list of recID that matched for t in tl: # deduce into which bibxxx table we will search: digit1, digit2 = int(t[0]), int(t[1]) bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) # construct and run query: if t == "001": res = run_sql("SELECT id FROM bibrec WHERE id %s" % query_addons, query_params) else: query = "SELECT bibx.id_bibrec FROM %s AS bx LEFT JOIN %s AS bibx ON bx.id=bibx.id_bibxxx WHERE bx.value %s" % \ (bx, bibx, query_addons) if len(t) != 6 or t[-1:]=='%': # wildcard query, or only the beginning of field 't' # is defined, so add wildcard character: query += " AND bx.tag LIKE %s" res = run_sql(query, query_params + (t + '%',)) else: # exact query for 't': query += " AND bx.tag=%s" res = run_sql(query, query_params + (t,)) # fill the result set: for id_bibrec in res: if id_bibrec[0]: l.append(id_bibrec[0]) # check no of hits found: nb_hits = len(l) # okay, return result set: set = HitSet(l) return set def search_unit_in_bibrec(datetext1, datetext2, type='c'): """ Return hitset of recIDs found that were either created or modified (according to 'type' arg being 'c' or 'm') from datetext1 until datetext2, inclusive. Does not pay attention to pattern, collection, anything. Useful to intersect later on with the 'real' query. """ set = HitSet() if type.startswith("m"): type = "modification_date" else: type = "creation_date" # by default we are searching for creation dates res = run_sql("SELECT id FROM bibrec WHERE %s>=%%s AND %s<=%%s" % (type, type), (datetext1, datetext2)) for row in res: set += row[0] return set def search_unit_by_times_cited(p): """ Return histset of recIDs found that are cited P times. Usually P looks like '10->23'. """ numstr = '"'+p+'"' #this is sort of stupid but since we may need to #get the records that do _not_ have cites, we have to #know the ids of all records, too #but this is needed only if bsu_p is 0 or 0 or 0->0 allrecs = [] if p == 0 or p == "0" or \ p.startswith("0->") or p.endswith("->0"): allrecs = HitSet(run_sql_cached("SELECT id FROM bibrec", affected_tables=['bibrec'])) return get_records_with_num_cites(numstr, allrecs) def intersect_results_with_collrecs(req, hitset_in_any_collection, colls, ap=0, of="hb", verbose=0, ln=CFG_SITE_LANG): """Return dict of hitsets given by intersection of hitset with the collection universes.""" _ = gettext_set_language(ln) # search stage 4: intersect with the collection universe: if verbose and of.startswith("h"): t1 = os.times()[4] results = {} results_nbhits = 0 for coll in colls: results[coll] = hitset_in_any_collection & get_collection_reclist(coll) results_nbhits += len(results[coll]) if results_nbhits == 0: # no hits found, try to search in Home: results_in_Home = hitset_in_any_collection & get_collection_reclist(CFG_SITE_NAME) if len(results_in_Home) > 0: # some hits found in Home, so propose this search: if of.startswith("h"): url = websearch_templates.build_search_url(req.argd, cc=CFG_SITE_NAME, c=[]) print_warning(req, _("No match found in collection %(x_collection)s. Other public collections gave %(x_url_open)s%(x_nb_hits)d hits%(x_url_close)s.") %\ {'x_collection': '<em>' + string.join([get_coll_i18nname(coll, ln, False) for coll in colls], ', ') + '</em>', 'x_url_open': '<a class="nearestterms" href="%s">' % (url), 'x_nb_hits': len(results_in_Home), 'x_url_close': '</a>'}) results = {} else: # no hits found in Home, recommend different search terms: if of.startswith("h"): print_warning(req, _("No public collection matched your query. " "If you were looking for a non-public document, please choose " "the desired restricted collection first.")) results = {} if verbose and of.startswith("h"): t2 = os.times()[4] print_warning(req, "Search stage 4: intersecting with collection universe gave %d hits." % results_nbhits) print_warning(req, "Search stage 4: execution took %.2f seconds." % (t2 - t1)) return results def intersect_results_with_hitset(req, results, hitset, ap=0, aptext="", of="hb"): """Return intersection of search 'results' (a dict of hitsets with collection as key) with the 'hitset', i.e. apply 'hitset' intersection to each collection within search 'results'. If the final 'results' set is to be empty, and 'ap' (approximate pattern) is true, and then print the `warningtext' and return the original 'results' set unchanged. If 'ap' is false, then return empty results set. """ if ap: results_ap = copy.deepcopy(results) else: results_ap = {} # will return empty dict in case of no hits found nb_total = 0 for coll in results.keys(): results[coll].intersection_update(hitset) nb_total += len(results[coll]) if nb_total == 0: if of.startswith("h"): print_warning(req, aptext) results = results_ap return results def create_similarly_named_authors_link_box(author_name, ln=CFG_SITE_LANG): """Return a box similar to ``Not satisfied...'' one by proposing author searches for similar names. Namely, take AUTHOR_NAME and the first initial of the firstame (after comma) and look into author index whether authors with e.g. middle names exist. Useful mainly for CERN Library that sometimes contains name forms like Ellis-N, Ellis-Nick, Ellis-Nicolas all denoting the same person. The box isn't proposed if no similarly named authors are found to exist. """ # return nothing if not configured: if CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX == 0: return "" # return empty box if there is no initial: if re.match(r'[^ ,]+, [^ ]', author_name) is None: return "" # firstly find name comma initial: author_name_to_search = re.sub(r'^([^ ,]+, +[^ ,]).*$', '\\1', author_name) # secondly search for similar name forms: similar_author_names = {} for name in author_name_to_search, strip_accents(author_name_to_search): for tag in get_field_tags("author"): # deduce into which bibxxx table we will search: digit1, digit2 = int(tag[0]), int(tag[1]) bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) if len(tag) != 6 or tag[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value LIKE %%s AND bx.tag LIKE %%s""" % bx, (name + "%", tag + "%")) else: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value LIKE %%s AND bx.tag=%%s""" % bx, (name + "%", tag)) for row in res: similar_author_names[row[0]] = 1 # remove the original name and sort the list: try: del similar_author_names[author_name] except KeyError: pass # thirdly print the box: out = "" if similar_author_names: out_authors = similar_author_names.keys() out_authors.sort() tmp_authors = [] for out_author in out_authors: nbhits = get_nbhits_in_bibxxx(out_author, "author") if nbhits: tmp_authors.append((out_author, nbhits)) out += websearch_templates.tmpl_similar_author_names( authors=tmp_authors, ln=ln) return out def create_nearest_terms_box(urlargd, p, f, t='w', n=5, ln=CFG_SITE_LANG, intro_text_p=True): """Return text box containing list of 'n' nearest terms above/below 'p' for the field 'f' for matching type 't' (words/phrases) in language 'ln'. Propose new searches according to `urlargs' with the new words. If `intro_text_p' is true, then display the introductory message, otherwise print only the nearest terms in the box content. """ # load the right message language _ = gettext_set_language(ln) out = "" nearest_terms = [] if not p: # sanity check p = "." index_id = get_index_id_from_field(f) # look for nearest terms: if t == 'w': nearest_terms = get_nearest_terms_in_bibwords(p, f, n, n) if not nearest_terms: return _("No word index is available for %s.") % \ ('<em>' + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + '</em>') else: nearest_terms = [] if index_id: nearest_terms = get_nearest_terms_in_idxphrase(p, index_id, n, n) if not nearest_terms: nearest_terms = get_nearest_terms_in_bibxxx(p, f, n, n) if not nearest_terms: return _("No phrase index is available for %s.") % \ ('<em>' + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + '</em>') terminfo = [] for term in nearest_terms: if t == 'w': hits = get_nbhits_in_bibwords(term, f) else: if index_id: hits = get_nbhits_in_idxphrases(term, f) else: hits = get_nbhits_in_bibxxx(term, f) argd = {} argd.update(urlargd) # check which fields contained the requested parameter, and replace it. for (px, fx) in ('p', 'f'), ('p1', 'f1'), ('p2', 'f2'), ('p3', 'f3'): if px in argd: if f == argd[fx] or f == "anyfield" or f == "": if string.find(argd[px], p) > -1: argd[px] = string.replace(argd[px], p, term) break else: if string.find(argd[px], f+':'+p) > -1: argd[px] = string.replace(argd[px], f+':'+p, f+':'+term) break elif string.find(argd[px], f+':"'+p+'"') > -1: argd[px] = string.replace(argd[px], f+':"'+p+'"', f+':"'+term+'"') break terminfo.append((term, hits, argd)) intro = "" if intro_text_p: # add full leading introductory text if f: intro = _("Search term %(x_term)s inside index %(x_index)s did not match any record. Nearest terms in any collection are:") % \ {'x_term': "<em>" + cgi.escape(p.startswith("%") and p.endswith("%") and p[1:-1] or p) + "</em>", 'x_index': "<em>" + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + "</em>"} else: intro = _("Search term %s did not match any record. Nearest terms in any collection are:") % \ ("<em>" + cgi.escape(p.startswith("%") and p.endswith("%") and p[1:-1] or p) + "</em>") return websearch_templates.tmpl_nearest_term_box(p=p, ln=ln, f=f, terminfo=terminfo, intro=intro) def get_nearest_terms_in_bibwords(p, f, n_below, n_above): """Return list of +n -n nearest terms to word `p' in index for field `f'.""" nearest_words = [] # will hold the (sorted) list of nearest words to return # deduce into which bibwordsX table we will search: bibwordsX = "idxWORD%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: bibwordsX = "idxWORD%02dF" % index_id else: return nearest_words # firstly try to get `n' closest words above `p': res = run_sql("SELECT term FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % bibwordsX, (p, n_above)) for row in res: nearest_words.append(row[0]) nearest_words.reverse() # secondly insert given word `p': nearest_words.append(p) # finally try to get `n' closest words below `p': res = run_sql("SELECT term FROM %s WHERE term>%%s ORDER BY term ASC LIMIT %%s" % bibwordsX, (p, n_below)) for row in res: nearest_words.append(row[0]) return nearest_words def get_nearest_terms_in_idxphrase(p, index_id, n_below, n_above): """Browse (-n_above, +n_below) closest bibliographic phrases for the given pattern p in the given field idxPHRASE table, regardless of collection. Return list of [phrase1, phrase2, ... , phrase_n].""" idxphraseX = "idxPHRASE%02dF" % index_id res_above = run_sql("SELECT term FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % idxphraseX, (p, n_above)) res_above = map(lambda x: x[0], res_above) res_above.reverse() res_below = run_sql("SELECT term FROM %s WHERE term>=%%s ORDER BY term ASC LIMIT %%s" % idxphraseX, (p, n_below)) res_below = map(lambda x: x[0], res_below) return res_above + res_below def get_nearest_terms_in_idxphrase_with_collection(p, index_id, n_below, n_above, collection): """Browse (-n_above, +n_below) closest bibliographic phrases for the given pattern p in the given field idxPHRASE table, considering the collection (HitSet). Return list of [(phrase1, hitset), (phrase2, hitset), ... , (phrase_n, hitset)].""" idxphraseX = "idxPHRASE%02dF" % index_id res_above = run_sql("SELECT term,hitlist FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % idxphraseX, (p, n_above * 3)) res_above = [(term, HitSet(hitlist) & collection) for term, hitlist in res_above] res_above = [(term, len(hitlist)) for term, hitlist in res_above if hitlist] res_below = run_sql("SELECT term,hitlist FROM %s WHERE term>=%%s ORDER BY term ASC LIMIT %%s" % idxphraseX, (p, n_below * 3)) res_below = [(term, HitSet(hitlist) & collection) for term, hitlist in res_below] res_below = [(term, len(hitlist)) for term, hitlist in res_below if hitlist] res_above.reverse() return res_above[-n_above:] + res_below[:n_below] def get_nearest_terms_in_bibxxx(p, f, n_below, n_above): """Browse (-n_above, +n_below) closest bibliographic phrases for the given pattern p in the given field f, regardless of collection. Return list of [phrase1, phrase2, ... , phrase_n].""" ## determine browse field: if not f and string.find(p, ":") > 0: # does 'p' contain ':'? f, p = string.split(p, ":", 1) # FIXME: quick hack for the journal index if f == 'journal': return get_nearest_terms_in_bibwords(p, f, n_below, n_above) ## We are going to take max(n_below, n_above) as the number of ## values to ferch from bibXXx. This is needed to work around ## MySQL UTF-8 sorting troubles in 4.0.x. Proper solution is to ## use MySQL 4.1.x or our own idxPHRASE in the future. index_id = get_index_id_from_field(f) if index_id: return get_nearest_terms_in_idxphrase(p, index_id, n_below, n_above) n_fetch = 2*max(n_below, n_above) ## construct 'tl' which defines the tag list (MARC tags) to search in: tl = [] if str(f[0]).isdigit() and str(f[1]).isdigit(): tl.append(f) # 'f' seems to be okay as it starts by two digits else: # deduce desired MARC tags on the basis of chosen 'f' tl = get_field_tags(f) ## start browsing to fetch list of hits: browsed_phrases = {} # will hold {phrase1: 1, phrase2: 1, ..., phraseN: 1} dict of browsed phrases (to make them unique) # always add self to the results set: browsed_phrases[p.startswith("%") and p.endswith("%") and p[1:-1] or p] = 1 for t in tl: # deduce into which bibxxx table we will search: digit1, digit2 = int(t[0]), int(t[1]) bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) # firstly try to get `n' closest phrases above `p': if len(t) != 6 or t[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value<%%s AND bx.tag LIKE %%s ORDER BY bx.value DESC LIMIT %%s""" % bx, (p, t + "%", n_fetch)) else: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value<%%s AND bx.tag=%%s ORDER BY bx.value DESC LIMIT %%s""" % bx, (p, t, n_fetch)) for row in res: browsed_phrases[row[0]] = 1 # secondly try to get `n' closest phrases equal to or below `p': if len(t) != 6 or t[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value>=%%s AND bx.tag LIKE %%s ORDER BY bx.value ASC LIMIT %%s""" % bx, (p, t + "%", n_fetch)) else: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value>=%%s AND bx.tag=%%s ORDER BY bx.value ASC LIMIT %%s""" % bx, (p, t, n_fetch)) for row in res: browsed_phrases[row[0]] = 1 # select first n words only: (this is needed as we were searching # in many different tables and so aren't sure we have more than n # words right; this of course won't be needed when we shall have # one ACC table only for given field): phrases_out = browsed_phrases.keys() phrases_out.sort(lambda x, y: cmp(string.lower(strip_accents(x)), string.lower(strip_accents(y)))) # find position of self: try: idx_p = phrases_out.index(p) except: idx_p = len(phrases_out)/2 # return n_above and n_below: return phrases_out[max(0, idx_p-n_above):idx_p+n_below] def get_nbhits_in_bibwords(word, f): """Return number of hits for word 'word' inside words index for field 'f'.""" out = 0 # deduce into which bibwordsX table we will search: bibwordsX = "idxWORD%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: bibwordsX = "idxWORD%02dF" % index_id else: return 0 if word: res = run_sql("SELECT hitlist FROM %s WHERE term=%%s" % bibwordsX, (word,)) for hitlist in res: out += len(HitSet(hitlist[0])) return out def get_nbhits_in_idxphrases(word, f): """Return number of hits for word 'word' inside phrase index for field 'f'.""" out = 0 # deduce into which bibwordsX table we will search: idxphraseX = "idxPHRASE%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: idxphraseX = "idxPHRASE%02dF" % index_id else: return 0 if word: res = run_sql("SELECT hitlist FROM %s WHERE term=%%s" % idxphraseX, (word,)) for hitlist in res: out += len(HitSet(hitlist[0])) return out def get_nbhits_in_bibxxx(p, f): """Return number of hits for word 'word' inside words index for field 'f'.""" ## determine browse field: if not f and string.find(p, ":") > 0: # does 'p' contain ':'? f, p = string.split(p, ":", 1) # FIXME: quick hack for the journal index if f == 'journal': return get_nbhits_in_bibwords(p, f) ## construct 'tl' which defines the tag list (MARC tags) to search in: tl = [] if str(f[0]).isdigit() and str(f[1]).isdigit(): tl.append(f) # 'f' seems to be okay as it starts by two digits else: # deduce desired MARC tags on the basis of chosen 'f' tl = get_field_tags(f) # start searching: recIDs = {} # will hold dict of {recID1: 1, recID2: 1, ..., } (unique recIDs, therefore) for t in tl: # deduce into which bibxxx table we will search: digit1, digit2 = int(t[0]), int(t[1]) bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) if len(t) != 6 or t[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character: res = run_sql("""SELECT bibx.id_bibrec FROM %s AS bibx, %s AS bx WHERE bx.value=%%s AND bx.tag LIKE %%s AND bibx.id_bibxxx=bx.id""" % (bibx, bx), (p, t + "%")) else: res = run_sql("""SELECT bibx.id_bibrec FROM %s AS bibx, %s AS bx WHERE bx.value=%%s AND bx.tag=%%s AND bibx.id_bibxxx=bx.id""" % (bibx, bx), (p, t)) for row in res: recIDs[row[0]] = 1 return len(recIDs) def get_mysql_recid_from_aleph_sysno(sysno): """Returns DB's recID for ALEPH sysno passed in the argument (e.g. "002379334CER"). Returns None in case of failure.""" out = None res = run_sql("""SELECT bb.id_bibrec FROM bibrec_bib97x AS bb, bib97x AS b WHERE b.value=%s AND b.tag='970__a' AND bb.id_bibxxx=b.id""", (sysno,)) if res: out = res[0][0] return out def guess_primary_collection_of_a_record(recID): """Return primary collection name a record recid belongs to, by testing 980 identifier. May lead to bad guesses when a collection is defined dynamically via dbquery. In that case, return 'CFG_SITE_NAME'.""" out = CFG_SITE_NAME dbcollids = get_fieldvalues(recID, "980__a") if dbcollids: dbquery = "collection:" + dbcollids[0] res = run_sql("SELECT name FROM collection WHERE dbquery=%s", (dbquery,)) if res: out = res[0][0] return out _re_collection_url = re.compile('/collection/(.+)') def guess_collection_of_a_record(recID, referer=None): """Return collection name a record recid belongs to, by first testing the referer URL if provided and otherwise returning the primary collection.""" if referer: dummy, hostname, path, dummy, query, dummy = urlparse.urlparse(referer) g = _re_collection_url.match(path) if g: name = urllib.unquote_plus(g.group(1)) if recID in get_collection_reclist(name): return name elif path.startswith('/search'): query = cgi.parse_qs(query) for name in query.get('cc', []) + query.get('c', []): if recID in get_collection_reclist(name): return name return guess_primary_collection_of_a_record(recID) def get_all_collections_of_a_record(recID): """Return all the collection names a record belongs to. Note this function is O(n_collections).""" ret = [] for name in collection_reclist_cache.cache.keys(): if recID in get_collection_reclist(name): ret.append(name) return ret def get_tag_name(tag_value, prolog="", epilog=""): """Return tag name from the known tag value, by looking up the 'tag' table. Return empty string in case of failure. Example: input='100__%', output=first author'.""" out = "" res = run_sql_cached("SELECT name FROM tag WHERE value=%s", (tag_value,), affected_tables=['tag',]) if res: out = prolog + res[0][0] + epilog return out def get_fieldcodes(): """Returns a list of field codes that may have been passed as 'search options' in URL. Example: output=['subject','division'].""" out = [] res = run_sql_cached("SELECT DISTINCT(code) FROM field", affected_tables=['field',]) for row in res: out.append(row[0]) return out def get_field_name(code): """Return the corresponding field_name given the field code. e.g. reportnumber -> report number.""" res = run_sql_cached("SELECT name FROM field WHERE code=%s", (code, ), affected_tables=['field',]) if res: return res[0][0] else: return "" def get_field_tags(field): """Returns a list of MARC tags for the field code 'field'. Returns empty list in case of error. Example: field='author', output=['100__%','700__%'].""" out = [] query = """SELECT t.value FROM tag AS t, field_tag AS ft, field AS f WHERE f.code=%s AND ft.id_field=f.id AND t.id=ft.id_tag ORDER BY ft.score DESC""" res = run_sql(query, (field, )) for val in res: out.append(val[0]) return out def get_fieldvalues(recIDs, tag, repetitive_values=True): """ Return list of field values for field TAG for the given record ID or list of record IDs. (RECIDS can be both an integer or a list of integers.) If REPETITIVE_VALUES is set to True, then return all values even if they are doubled. If set to False, then return unique values only. """ out = [] if isinstance(recIDs, (int, long)): recIDs =[recIDs,] if not isinstance(recIDs, (list, tuple)): return [] if len(recIDs) == 0: return [] if tag == "001___": # we have asked for tag 001 (=recID) that is not stored in bibXXx tables out = [str(recID) for recID in recIDs] else: # we are going to look inside bibXXx tables digits = tag[0:2] try: intdigits = int(digits) if intdigits < 0 or intdigits > 99: raise ValueError except ValueError: # invalid tag value asked for return [] bx = "bib%sx" % digits bibx = "bibrec_bib%sx" % digits queryparam = [] for recID in recIDs: queryparam.append(recID) if not repetitive_values: queryselect = "DISTINCT(bx.value)" else: queryselect = "bx.value" query = "SELECT %s FROM %s AS bx, %s AS bibx WHERE bibx.id_bibrec IN (%s) " \ " AND bx.id=bibx.id_bibxxx AND bx.tag LIKE %%s " \ " ORDER BY bibx.field_number, bx.tag ASC" % \ (queryselect, bx, bibx, ("%s,"*len(queryparam))[:-1]) res = run_sql(query, tuple(queryparam) + (tag,)) for row in res: out.append(row[0]) return out def get_fieldvalues_alephseq_like(recID, tags_in): """Return buffer of ALEPH sequential-like textual format with fields found in the list TAGS_IN for record RECID.""" out = "" if type(tags_in) is not list: tags_in = [tags_in,] if len(tags_in) == 1 and len(tags_in[0]) == 6: ## case A: one concrete subfield asked, so print its value if found ## (use with care: can false you if field has multiple occurrences) out += string.join(get_fieldvalues(recID, tags_in[0]),"\n") else: ## case B: print our "text MARC" format; works safely all the time # find out which tags to output: dict_of_tags_out = {} if not tags_in: for i in range(0, 10): for j in range(0, 10): dict_of_tags_out["%d%d%%" % (i, j)] = 1 else: for tag in tags_in: if len(tag) == 0: for i in range(0, 10): for j in range(0, 10): dict_of_tags_out["%d%d%%" % (i, j)] = 1 elif len(tag) == 1: for j in range(0, 10): dict_of_tags_out["%s%d%%" % (tag, j)] = 1 elif len(tag) < 5: dict_of_tags_out["%s%%" % tag] = 1 elif tag >= 6: dict_of_tags_out[tag[0:5]] = 1 tags_out = dict_of_tags_out.keys() tags_out.sort() # search all bibXXx tables as needed: for tag in tags_out: digits = tag[0:2] try: intdigits = int(digits) if intdigits < 0 or intdigits > 99: raise ValueError except ValueError: # invalid tag value asked for continue if tag.startswith("001") or tag.startswith("00%"): if out: out += "\n" out += "%09d %s %d" % (recID, "001__", recID) bx = "bib%sx" % digits bibx = "bibrec_bib%sx" % digits query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\ "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s"\ "ORDER BY bb.field_number, b.tag ASC" % (bx, bibx) res = run_sql(query, (recID, str(tag)+'%')) # go through fields: field_number_old = -999 field_old = "" for row in res: field, value, field_number = row[0], row[1], row[2] ind1, ind2 = field[3], field[4] if ind1 == "_": ind1 = "" if ind2 == "_": ind2 = "" # print field tag if field_number != field_number_old or field[:-1] != field_old[:-1]: if out: out += "\n" out += "%09d %s " % (recID, field[:5]) field_number_old = field_number field_old = field # print subfield value if field[0:2] == "00" and field[-1:] == "_": out += value else: out += "$$%s%s" % (field[-1:], value) return out def record_exists(recID): """Return 1 if record RECID exists. Return 0 if it doesn't exist. Return -1 if it exists but is marked as deleted.""" out = 0 res = run_sql("SELECT id FROM bibrec WHERE id=%s", (recID,), 1) if res: # record exists; now check whether it isn't marked as deleted: dbcollids = get_fieldvalues(recID, "980__%") if ("DELETED" in dbcollids) or (CFG_CERN_SITE and "DUMMY" in dbcollids): out = -1 # exists, but marked as deleted else: out = 1 # exists fine return out def record_public_p(recID): """Return 1 if the record is public, i.e. if it can be found in the Home collection. Return 0 otherwise. """ return recID in get_collection_reclist(CFG_SITE_NAME) def get_creation_date(recID, fmt="%Y-%m-%d"): "Returns the creation date of the record 'recID'." out = "" res = run_sql("SELECT DATE_FORMAT(creation_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1) if res: out = res[0][0] return out def get_modification_date(recID, fmt="%Y-%m-%d"): "Returns the date of last modification for the record 'recID'." out = "" res = run_sql("SELECT DATE_FORMAT(modification_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1) if res: out = res[0][0] return out def print_warning(req, msg, type='', prologue='<br />', epilogue='<br />'): "Prints warning message and flushes output." if req and msg: req.write(websearch_templates.tmpl_print_warning( msg = msg, type = type, prologue = prologue, epilogue = epilogue, )) return def print_search_info(p, f, sf, so, sp, rm, of, ot, collection=CFG_SITE_NAME, nb_found=-1, jrec=1, rg=10, - as=0, ln=CFG_SITE_LANG, p1="", p2="", p3="", f1="", f2="", f3="", m1="", m2="", m3="", op1="", op2="", + aas=0, ln=CFG_SITE_LANG, p1="", p2="", p3="", f1="", f2="", f3="", m1="", m2="", m3="", op1="", op2="", sc=1, pl_in_url="", d1y=0, d1m=0, d1d=0, d2y=0, d2m=0, d2d=0, dt="", cpu_time=-1, middle_only=0): """Prints stripe with the information on 'collection' and 'nb_found' results and CPU time. Also, prints navigation links (beg/next/prev/end) inside the results set. If middle_only is set to 1, it will only print the middle box information (beg/netx/prev/end/etc) links. This is suitable for displaying navigation links at the bottom of the search results page.""" out = "" # sanity check: if jrec < 1: jrec = 1 if jrec > nb_found: jrec = max(nb_found-rg+1, 1) return websearch_templates.tmpl_print_search_info( ln = ln, collection = collection, - as = as, + aas = aas, collection_name = get_coll_i18nname(collection, ln, False), collection_id = get_colID(collection), middle_only = middle_only, rg = rg, nb_found = nb_found, sf = sf, so = so, rm = rm, of = of, ot = ot, p = p, f = f, p1 = p1, p2 = p2, p3 = p3, f1 = f1, f2 = f2, f3 = f3, m1 = m1, m2 = m2, m3 = m3, op1 = op1, op2 = op2, pl_in_url = pl_in_url, d1y = d1y, d1m = d1m, d1d = d1d, d2y = d2y, d2m = d2m, d2d = d2d, dt = dt, jrec = jrec, sc = sc, sp = sp, all_fieldcodes = get_fieldcodes(), cpu_time = cpu_time, ) def print_results_overview(req, colls, results_final_nb_total, results_final_nb, cpu_time, ln=CFG_SITE_LANG, ec=[]): """Prints results overview box with links to particular collections below.""" out = "" new_colls = [] for coll in colls: new_colls.append({ 'id': get_colID(coll), 'code': coll, 'name': get_coll_i18nname(coll, ln, False), }) return websearch_templates.tmpl_print_results_overview( ln = ln, results_final_nb_total = results_final_nb_total, results_final_nb = results_final_nb, cpu_time = cpu_time, colls = new_colls, ec = ec, ) def sort_records(req, recIDs, sort_field='', sort_order='d', sort_pattern='', verbose=0, of='hb', ln=CFG_SITE_LANG): """Sort records in 'recIDs' list according sort field 'sort_field' in order 'sort_order'. If more than one instance of 'sort_field' is found for a given record, try to choose that that is given by 'sort pattern', for example "sort by report number that starts by CERN-PS". Note that 'sort_field' can be field code like 'author' or MARC tag like '100__a' directly.""" _ = gettext_set_language(ln) ## check arguments: if not sort_field: return recIDs if len(recIDs) > CFG_WEBSEARCH_NB_RECORDS_TO_SORT: if of.startswith('h'): print_warning(req, _("Sorry, sorting is allowed on sets of up to %d records only. Using default sort order.") % CFG_WEBSEARCH_NB_RECORDS_TO_SORT, "Warning") return recIDs sort_fields = string.split(sort_field, ",") recIDs_dict = {} recIDs_out = [] ## first deduce sorting MARC tag out of the 'sort_field' argument: tags = [] for sort_field in sort_fields: if sort_field and str(sort_field[0:2]).isdigit(): # sort_field starts by two digits, so this is probably a MARC tag already tags.append(sort_field) else: # let us check the 'field' table query = """SELECT DISTINCT(t.value) FROM tag AS t, field_tag AS ft, field AS f WHERE f.code=%s AND ft.id_field=f.id AND t.id=ft.id_tag ORDER BY ft.score DESC""" res = run_sql(query, (sort_field, )) if res: for row in res: tags.append(row[0]) else: if of.startswith('h'): print_warning(req, _("Sorry, %s does not seem to be a valid sort option. Choosing title sort instead.") % cgi.escape(sort_field), "Error") tags.append("245__a") if verbose >= 3: print_warning(req, "Sorting by tags %s." % cgi.escape(repr(tags))) if sort_pattern: print_warning(req, "Sorting preferentially by %s." % cgi.escape(sort_pattern)) ## check if we have sorting tag defined: if tags: # fetch the necessary field values: for recID in recIDs: val = "" # will hold value for recID according to which sort vals = [] # will hold all values found in sorting tag for recID for tag in tags: vals.extend(get_fieldvalues(recID, tag)) if sort_pattern: # try to pick that tag value that corresponds to sort pattern bingo = 0 for v in vals: if v.lower().startswith(sort_pattern.lower()): # bingo! bingo = 1 val = v break if not bingo: # sort_pattern not present, so add other vals after spaces val = sort_pattern + " " + string.join(vals) else: # no sort pattern defined, so join them all together val = string.join(vals) val = strip_accents(val.lower()) # sort values regardless of accents and case if recIDs_dict.has_key(val): recIDs_dict[val].append(recID) else: recIDs_dict[val] = [recID] # sort them: recIDs_dict_keys = recIDs_dict.keys() recIDs_dict_keys.sort() # now that keys are sorted, create output array: for k in recIDs_dict_keys: for s in recIDs_dict[k]: recIDs_out.append(s) # ascending or descending? if sort_order == 'a': recIDs_out.reverse() # okay, we are done return recIDs_out else: # good, no sort needed return recIDs def print_records(req, recIDs, jrec=1, rg=10, format='hb', ot='', ln=CFG_SITE_LANG, relevances=[], relevances_prologue="(", relevances_epilogue="%%)", decompress=zlib.decompress, search_pattern='', print_records_prologue_p=True, print_records_epilogue_p=True, verbose=0, tab=''): """ Prints list of records 'recIDs' formatted according to 'format' in groups of 'rg' starting from 'jrec'. Assumes that the input list 'recIDs' is sorted in reverse order, so it counts records from tail to head. A value of 'rg=-9999' means to print all records: to be used with care. Print also list of RELEVANCES for each record (if defined), in between RELEVANCE_PROLOGUE and RELEVANCE_EPILOGUE. Print prologue and/or epilogue specific to 'format' if 'print_records_prologue_p' and/or print_records_epilogue_p' are True. """ # load the right message language _ = gettext_set_language(ln) # sanity checking: if req is None: return # get user_info (for formatting based on user) user_info = collect_user_info(req) if len(recIDs): nb_found = len(recIDs) if rg == -9999: # print all records rg = nb_found else: rg = abs(rg) if jrec < 1: # sanity checks jrec = 1 if jrec > nb_found: jrec = max(nb_found-rg+1, 1) # will print records from irec_max to irec_min excluded: irec_max = nb_found - jrec irec_min = nb_found - jrec - rg if irec_min < 0: irec_min = -1 if irec_max >= nb_found: irec_max = nb_found - 1 #req.write("%s:%d-%d" % (recIDs, irec_min, irec_max)) if format.startswith('x'): # print header if needed if print_records_prologue_p: print_records_prologue(req, format) # print records recIDs_to_print = [recIDs[x] for x in range(irec_max, irec_min, -1)] format_records(recIDs_to_print, format, ln=ln, search_pattern=search_pattern, record_separator="\n", user_info=user_info, req=req) # print footer if needed if print_records_epilogue_p: print_records_epilogue(req, format) elif format.startswith('t') or str(format[0:3]).isdigit(): # we are doing plain text output: for irec in range(irec_max, irec_min, -1): x = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) req.write(x) if x: req.write('\n') elif format == 'excel': recIDs_to_print = [recIDs[x] for x in range(irec_max, irec_min, -1)] create_excel(recIDs=recIDs_to_print, req=req, ln=ln, ot=ot) else: # we are doing HTML output: if format == 'hp' or format.startswith("hb_") or format.startswith("hd_"): # portfolio and on-the-fly formats: for irec in range(irec_max, irec_min, -1): req.write(print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose)) elif format.startswith("hb"): # HTML brief format: req.write(websearch_templates.tmpl_record_format_htmlbrief_header( ln = ln)) for irec in range(irec_max, irec_min, -1): row_number = jrec+irec_max-irec recid = recIDs[irec] if relevances and relevances[irec]: relevance = relevances[irec] else: relevance = '' record = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) req.write(websearch_templates.tmpl_record_format_htmlbrief_body( ln = ln, recid = recid, row_number = row_number, relevance = relevance, record = record, relevances_prologue = relevances_prologue, relevances_epilogue = relevances_epilogue, )) req.write(websearch_templates.tmpl_record_format_htmlbrief_footer( ln = ln)) elif format.startswith("hd"): # HTML detailed format: for irec in range(irec_max, irec_min, -1): unordered_tabs = get_detailed_page_tabs(get_colID(guess_primary_collection_of_a_record(recIDs[irec])), recIDs[irec], ln=ln) ordered_tabs_id = [(tab_id, values['order']) for (tab_id, values) in unordered_tabs.iteritems()] ordered_tabs_id.sort(lambda x,y: cmp(x[1],y[1])) link_ln = '' if ln != CFG_SITE_LANG: link_ln = '?ln=%s' % ln if CFG_WEBSEARCH_USE_ALEPH_SYSNOS: recid_to_display = get_fieldvalues(recIDs[irec], CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG)[0] else: recid_to_display = recIDs[irec] tabs = [(unordered_tabs[tab_id]['label'], \ '%s/record/%s/%s%s' % (CFG_SITE_URL, recid_to_display, tab_id, link_ln), \ tab_id == tab, unordered_tabs[tab_id]['enabled']) \ for (tab_id, order) in ordered_tabs_id if unordered_tabs[tab_id]['visible'] == True] content = '' # load content if tab == 'usage': req.write(webstyle_templates.detailed_record_container_top(recIDs[irec], tabs, ln)) r = calculate_reading_similarity_list(recIDs[irec], "downloads") downloadsimilarity = None downloadhistory = None #if r: # downloadsimilarity = r if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS: downloadhistory = create_download_history_graph_and_box(recIDs[irec], ln) r = calculate_reading_similarity_list(recIDs[irec], "pageviews") viewsimilarity = None if r: viewsimilarity = r content = websearch_templates.tmpl_detailed_record_statistics(recIDs[irec], ln, downloadsimilarity=downloadsimilarity, downloadhistory=downloadhistory, viewsimilarity=viewsimilarity) req.write(content) req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec], tabs, ln)) elif tab == 'citations': recid = recIDs[irec] req.write(webstyle_templates.detailed_record_container_top(recid, tabs, ln)) req.write(websearch_templates.tmpl_detailed_record_citations_prologue(recid, ln)) # Citing citinglist = [] r = calculate_cited_by_list(recid) if r: citinglist = r req.write(websearch_templates.tmpl_detailed_record_citations_citing_list(recid, ln, citinglist=citinglist)) # Self-cited selfcited = get_self_cited_by(recid) req.write(websearch_templates.tmpl_detailed_record_citations_self_cited(recid, ln, selfcited=selfcited, citinglist=citinglist)) # Co-cited s = calculate_co_cited_with_list(recid) cociting = None if s: cociting = s req.write(websearch_templates.tmpl_detailed_record_citations_co_citing(recid, ln, cociting=cociting)) # Citation history citationhistory = None if r: citationhistory = create_citation_history_graph_and_box(recid, ln) #debug if verbose > 3: print_warning(req, "Citation graph debug: "+str(len(citationhistory))) req.write(websearch_templates.tmpl_detailed_record_citations_citation_history(recid, ln, citationhistory)) req.write(websearch_templates.tmpl_detailed_record_citations_epilogue(recid, ln)) req.write(webstyle_templates.detailed_record_container_bottom(recid, tabs, ln)) elif tab == 'references': req.write(webstyle_templates.detailed_record_container_top(recIDs[irec], tabs, ln)) req.write(format_record(recIDs[irec], 'HDREF', ln=ln, user_info=user_info, verbose=verbose)) req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec], tabs, ln)) elif tab == 'holdings': req.write(webstyle_templates.detailed_record_container_top(recIDs[irec], tabs, ln)) req.write(format_record(recIDs[irec], 'HDHOLD', ln=ln, user_info=user_info, verbose=verbose)) req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec], tabs, ln)) else: # Metadata tab req.write(webstyle_templates.detailed_record_container_top(recIDs[irec], tabs, ln, show_short_rec_p=False)) creationdate = None modificationdate = None if record_exists(recIDs[irec]) == 1: creationdate = get_creation_date(recIDs[irec]) modificationdate = get_modification_date(recIDs[irec]) content = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) content = websearch_templates.tmpl_detailed_record_metadata( recID = recIDs[irec], ln = ln, format = format, creationdate = creationdate, modificationdate = modificationdate, content = content) req.write(content) req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec], tabs, ln, creationdate=creationdate, modificationdate=modificationdate, show_short_rec_p=False)) if len(tabs) > 0: # Add the mini box at bottom of the page if CFG_WEBCOMMENT_ALLOW_REVIEWS: from invenio.webcomment import get_mini_reviews reviews = get_mini_reviews(recid = recIDs[irec], ln=ln) else: reviews = '' actions = format_record(recIDs[irec], 'HDACT', ln=ln, user_info=user_info, verbose=verbose) files = format_record(recIDs[irec], 'HDFILE', ln=ln, user_info=user_info, verbose=verbose) req.write(webstyle_templates.detailed_record_mini_panel(recIDs[irec], ln, format, files=files, reviews=reviews, actions=actions)) else: # Other formats for irec in range(irec_max, irec_min, -1): req.write(print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose)) else: print_warning(req, _("Use different search terms.")) def print_records_prologue(req, format): """ Print the appropriate prologue for list of records in the given format. """ prologue = "" # no prologue needed for HTML or Text formats if format.startswith('xm'): prologue = websearch_templates.tmpl_xml_marc_prologue() elif format.startswith('xn'): prologue = websearch_templates.tmpl_xml_nlm_prologue() elif format.startswith('xw'): prologue = websearch_templates.tmpl_xml_refworks_prologue() elif format.startswith('xr'): prologue = websearch_templates.tmpl_xml_rss_prologue() elif format.startswith('xe'): prologue = websearch_templates.tmpl_xml_endnote_prologue() elif format.startswith('xo'): prologue = websearch_templates.tmpl_xml_mods_prologue() elif format.startswith('x'): prologue = websearch_templates.tmpl_xml_default_prologue() req.write(prologue) def print_records_epilogue(req, format): """ Print the appropriate epilogue for list of records in the given format. """ epilogue = "" # no epilogue needed for HTML or Text formats if format.startswith('xm'): epilogue = websearch_templates.tmpl_xml_marc_epilogue() elif format.startswith('xn'): epilogue = websearch_templates.tmpl_xml_nlm_epilogue() elif format.startswith('xw'): epilogue = websearch_templates.tmpl_xml_refworks_epilogue() elif format.startswith('xr'): epilogue = websearch_templates.tmpl_xml_rss_epilogue() elif format.startswith('xe'): epilogue = websearch_templates.tmpl_xml_endnote_epilogue() elif format.startswith('xo'): epilogue = websearch_templates.tmpl_xml_mods_epilogue() elif format.startswith('x'): epilogue = websearch_templates.tmpl_xml_default_epilogue() req.write(epilogue) def get_record(recid): """Directly the record object corresponding to the recid.""" from marshal import loads, dumps from zlib import compress, decompress if CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE: value = run_sql('SELECT value FROM bibfmt WHERE id_bibrec=%s AND FORMAT=\'recstruct\'', (recid, )) if value: try: return loads(decompress(value[0][0])) except: ### In case of corruption, let's rebuild it! pass return create_record(print_record(recid, 'xm'))[0] def print_record(recID, format='hb', ot='', ln=CFG_SITE_LANG, decompress=zlib.decompress, search_pattern=None, user_info=None, verbose=0): """Prints record 'recID' formatted accoding to 'format'.""" if format == 'recstruct': return get_record(recID) _ = gettext_set_language(ln) out = "" # sanity check: record_exist_p = record_exists(recID) if record_exist_p == 0: # doesn't exist return out # New Python BibFormat procedure for formatting # Old procedure follows further below # We must still check some special formats, but these # should disappear when BibFormat improves. if not (CFG_BIBFORMAT_USE_OLD_BIBFORMAT \ or format.lower().startswith('t') \ or format.lower().startswith('hm') \ or str(format[0:3]).isdigit() \ or ot): # Unspecified format is hd if format == '': format = 'hd' if record_exist_p == -1 and get_output_format_content_type(format) == 'text/html': # HTML output displays a default value for deleted records. # Other format have to deal with it. out += _("The record has been deleted.") else: out += call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) # at the end of HTML brief mode, print the "Detailed record" functionality: if format.lower().startswith('hb') and \ format.lower() != 'hb_p': out += websearch_templates.tmpl_print_record_brief_links( ln = ln, recID = recID, ) return out # Old PHP BibFormat procedure for formatting # print record opening tags, if needed: if format == "marcxml" or format == "oai_dc": out += " <record>\n" out += " <header>\n" for oai_id in get_fieldvalues(recID, CFG_OAI_ID_FIELD): out += " <identifier>%s</identifier>\n" % oai_id out += " <datestamp>%s</datestamp>\n" % get_modification_date(recID) out += " </header>\n" out += " <metadata>\n" if format.startswith("xm") or format == "marcxml": # look for detailed format existence: query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s" res = run_sql(query, (recID, format), 1) if res and record_exist_p == 1: # record 'recID' is formatted in 'format', so print it out += "%s" % decompress(res[0][0]) else: # record 'recID' is not formatted in 'format' -- they are not in "bibfmt" table; so fetch all the data from "bibXXx" tables: if format == "marcxml": out += """ <record xmlns="http://www.loc.gov/MARC21/slim">\n""" out += " <controlfield tag=\"001\">%d</controlfield>\n" % int(recID) elif format.startswith("xm"): out += """ <record>\n""" out += " <controlfield tag=\"001\">%d</controlfield>\n" % int(recID) if record_exist_p == -1: # deleted record, so display only OAI ID and 980: oai_ids = get_fieldvalues(recID, CFG_OAI_ID_FIELD) if oai_ids: out += "<datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\"><subfield code=\"%s\">%s</subfield></datafield>\n" % \ (CFG_OAI_ID_FIELD[0:3], CFG_OAI_ID_FIELD[3:4], CFG_OAI_ID_FIELD[4:5], CFG_OAI_ID_FIELD[5:6], oai_ids[0]) out += "<datafield tag=\"980\" ind1=\"\" ind2=\"\"><subfield code=\"c\">DELETED</subfield></datafield>\n" else: # controlfields query = "SELECT b.tag,b.value,bb.field_number FROM bib00x AS b, bibrec_bib00x AS bb "\ "WHERE bb.id_bibrec=%s AND b.id=bb.id_bibxxx AND b.tag LIKE '00%%' "\ "ORDER BY bb.field_number, b.tag ASC" res = run_sql(query, (recID, )) for row in res: field, value = row[0], row[1] value = encode_for_xml(value) out += """ <controlfield tag="%s" >%s</controlfield>\n""" % \ (encode_for_xml(field[0:3]), value) # datafields i = 1 # Do not process bib00x and bibrec_bib00x, as # they are controlfields. So start at bib01x and # bibrec_bib00x (and set i = 0 at the end of # first loop) for digit1 in range(0, 10): for digit2 in range(i, 10): bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\ "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s"\ "ORDER BY bb.field_number, b.tag ASC" % (bx, bibx) res = run_sql(query, (recID, str(digit1)+str(digit2)+'%')) field_number_old = -999 field_old = "" for row in res: field, value, field_number = row[0], row[1], row[2] ind1, ind2 = field[3], field[4] if ind1 == "_" or ind1 == "": ind1 = " " if ind2 == "_" or ind2 == "": ind2 = " " # print field tag if field_number != field_number_old or field[:-1] != field_old[:-1]: if field_number_old != -999: out += """ </datafield>\n""" out += """ <datafield tag="%s" ind1="%s" ind2="%s">\n""" % \ (encode_for_xml(field[0:3]), encode_for_xml(ind1), encode_for_xml(ind2)) field_number_old = field_number field_old = field # print subfield value value = encode_for_xml(value) out += """ <subfield code="%s">%s</subfield>\n""" % \ (encode_for_xml(field[-1:]), value) # all fields/subfields printed in this run, so close the tag: if field_number_old != -999: out += """ </datafield>\n""" i = 0 # Next loop should start looking at bib%0 and bibrec_bib00x # we are at the end of printing the record: out += " </record>\n" elif format == "xd" or format == "oai_dc": # XML Dublin Core format, possibly OAI -- select only some bibXXx fields: out += """ <dc xmlns="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://www.openarchives.org/OAI/1.1/dc.xsd">\n""" if record_exist_p == -1: out += "" else: for f in get_fieldvalues(recID, "041__a"): out += " <language>%s</language>\n" % f for f in get_fieldvalues(recID, "100__a"): out += " <creator>%s</creator>\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "700__a"): out += " <creator>%s</creator>\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "245__a"): out += " <title>%s</title>\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "65017a"): out += " <subject>%s</subject>\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "8564_u"): out += " <identifier>%s</identifier>\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "520__a"): out += " <description>%s</description>\n" % encode_for_xml(f) out += " <date>%s</date>\n" % get_creation_date(recID) out += " </dc>\n" elif len(format) == 6 and str(format[0:3]).isdigit(): # user has asked to print some fields only if format == "001": out += "<!--%s-begin-->%s<!--%s-end-->\n" % (format, recID, format) else: vals = get_fieldvalues(recID, format) for val in vals: out += "<!--%s-begin-->%s<!--%s-end-->\n" % (format, val, format) elif format.startswith('t'): ## user directly asked for some tags to be displayed only if record_exist_p == -1: out += get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"]) else: out += get_fieldvalues_alephseq_like(recID, ot) elif format == "hm": if record_exist_p == -1: out += "<pre>" + cgi.escape(get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"])) + "</pre>" else: out += "<pre>" + cgi.escape(get_fieldvalues_alephseq_like(recID, ot)) + "</pre>" elif format.startswith("h") and ot: ## user directly asked for some tags to be displayed only if record_exist_p == -1: out += "<pre>" + get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"]) + "</pre>" else: out += "<pre>" + get_fieldvalues_alephseq_like(recID, ot) + "</pre>" elif format == "hd": # HTML detailed format if record_exist_p == -1: out += _("The record has been deleted.") else: # look for detailed format existence: query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s" res = run_sql(query, (recID, format), 1) if res: # record 'recID' is formatted in 'format', so print it out += "%s" % decompress(res[0][0]) else: # record 'recID' is not formatted in 'format', so try to call BibFormat on the fly or use default format: out_record_in_format = call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) if out_record_in_format: out += out_record_in_format else: out += websearch_templates.tmpl_print_record_detailed( ln = ln, recID = recID, ) elif format.startswith("hb_") or format.startswith("hd_"): # underscore means that HTML brief/detailed formats should be called on-the-fly; suitable for testing formats if record_exist_p == -1: out += _("The record has been deleted.") else: out += call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) elif format.startswith("hx"): # BibTeX format, called on the fly: if record_exist_p == -1: out += _("The record has been deleted.") else: out += call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) elif format.startswith("hs"): # for citation/download similarity navigation links: if record_exist_p == -1: out += _("The record has been deleted.") else: out += '<a href="%s">' % websearch_templates.build_search_url(recid=recID, ln=ln) # firstly, title: titles = get_fieldvalues(recID, "245__a") if titles: for title in titles: out += "<strong>%s</strong>" % title else: # usual title not found, try conference title: titles = get_fieldvalues(recID, "111__a") if titles: for title in titles: out += "<strong>%s</strong>" % title else: # just print record ID: out += "<strong>%s %d</strong>" % (get_field_i18nname("record ID", ln, False), recID) out += "</a>" # secondly, authors: authors = get_fieldvalues(recID, "100__a") + get_fieldvalues(recID, "700__a") if authors: out += " - %s" % authors[0] if len(authors) > 1: out += " <em>et al</em>" # thirdly publication info: publinfos = get_fieldvalues(recID, "773__s") if not publinfos: publinfos = get_fieldvalues(recID, "909C4s") if not publinfos: publinfos = get_fieldvalues(recID, "037__a") if not publinfos: publinfos = get_fieldvalues(recID, "088__a") if publinfos: out += " - %s" % publinfos[0] else: # fourthly publication year (if not publication info): years = get_fieldvalues(recID, "773__y") if not years: years = get_fieldvalues(recID, "909C4y") if not years: years = get_fieldvalues(recID, "260__c") if years: out += " (%s)" % years[0] else: # HTML brief format by default if record_exist_p == -1: out += _("The record has been deleted.") else: query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s" res = run_sql(query, (recID, format)) if res: # record 'recID' is formatted in 'format', so print it out += "%s" % decompress(res[0][0]) else: # record 'recID' is not formatted in 'format', so try to call BibFormat on the fly: or use default format: if CFG_WEBSEARCH_CALL_BIBFORMAT: out_record_in_format = call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) if out_record_in_format: out += out_record_in_format else: out += websearch_templates.tmpl_print_record_brief( ln = ln, recID = recID, ) else: out += websearch_templates.tmpl_print_record_brief( ln = ln, recID = recID, ) # at the end of HTML brief mode, print the "Detailed record" functionality: if format == 'hp' or format.startswith("hb_") or format.startswith("hd_"): pass # do nothing for portfolio and on-the-fly formats else: out += websearch_templates.tmpl_print_record_brief_links( ln = ln, recID = recID, ) # print record closing tags, if needed: if format == "marcxml" or format == "oai_dc": out += " </metadata>\n" out += " </record>\n" return out def call_bibformat(recID, format="HD", ln=CFG_SITE_LANG, search_pattern=None, user_info=None, verbose=0): """ Calls BibFormat and returns formatted record. BibFormat will decide by itself if old or new BibFormat must be used. """ keywords = [] if search_pattern is not None: units = create_basic_search_units(None, str(search_pattern), None) keywords = [unit[1] for unit in units if unit[0] != '-'] return format_record(recID, of=format, ln=ln, search_pattern=keywords, user_info=user_info, verbose=verbose) def log_query(hostname, query_args, uid=-1): """ Log query into the query and user_query tables. Return id_query or None in case of problems. """ id_query = None if uid >= 0: # log the query only if uid is reasonable res = run_sql("SELECT id FROM query WHERE urlargs=%s", (query_args,), 1) try: id_query = res[0][0] except: id_query = run_sql("INSERT INTO query (type, urlargs) VALUES ('r', %s)", (query_args,)) if id_query: run_sql("INSERT INTO user_query (id_user, id_query, hostname, date) VALUES (%s, %s, %s, %s)", (uid, id_query, hostname, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))) return id_query def log_query_info(action, p, f, colls, nb_records_found_total=-1): """Write some info to the log file for later analysis.""" try: log = open(CFG_LOGDIR + "/search.log", "a") log.write(time.strftime("%Y%m%d%H%M%S#", time.localtime())) log.write(action+"#") log.write(p+"#") log.write(f+"#") for coll in colls[:-1]: log.write("%s," % coll) log.write("%s#" % colls[-1]) log.write("%d" % nb_records_found_total) log.write("\n") log.close() except: pass return def wash_url_argument(var, new_type): """Wash list argument into 'new_type', that can be 'list', 'str', or 'int'. Useful for washing mod_python passed arguments, that are all lists of strings (URL args may be multiple), but we sometimes want only to take the first value, and sometimes to represent it as string or numerical value.""" out = [] if new_type == 'list': # return lst if type(var) is list: out = var else: out = [var] elif new_type == 'str': # return str if type(var) is list: try: out = "%s" % var[0] except: out = "" elif type(var) is str: out = var else: out = "%s" % var elif new_type == 'int': # return int if type(var) is list: try: out = string.atoi(var[0]) except: out = 0 elif type(var) is int: out = var elif type(var) is str: try: out = string.atoi(var) except: out = 0 else: out = 0 return out ### CALLABLES -def perform_request_search(req=None, cc=CFG_SITE_NAME, c=None, p="", f="", rg=10, sf="", so="d", sp="", rm="", of="id", ot="", as=0, +def perform_request_search(req=None, cc=CFG_SITE_NAME, c=None, p="", f="", rg=10, sf="", so="d", sp="", rm="", of="id", ot="", aas=0, p1="", f1="", m1="", op1="", p2="", f2="", m2="", op2="", p3="", f3="", m3="", sc=0, jrec=0, recid=-1, recidb=-1, sysno="", id=-1, idb=-1, sysnb="", action="", d1="", d1y=0, d1m=0, d1d=0, d2="", d2y=0, d2m=0, d2d=0, dt="", verbose=0, ap=0, ln=CFG_SITE_LANG, ec=None, tab=""): """Perform search or browse request, without checking for authentication. Return list of recIDs found, if of=id. Otherwise create web page. The arguments are as follows: req - mod_python Request class instance. cc - current collection (e.g. "ATLAS"). The collection the user started to search/browse from. c - collection list (e.g. ["Theses", "Books"]). The collections user may have selected/deselected when starting to search from 'cc'. p - pattern to search for (e.g. "ellis and muon or kaon"). f - field to search within (e.g. "author"). rg - records in groups of (e.g. "10"). Defines how many hits per collection in the search results page are displayed. sf - sort field (e.g. "title"). so - sort order ("a"=ascending, "d"=descending). sp - sort pattern (e.g. "CERN-") -- in case there are more values in a sort field, this argument tells which one to prefer rm - ranking method (e.g. "jif"). Defines whether results should be ranked by some known ranking method. of - output format (e.g. "hb"). Usually starting "h" means HTML output (and "hb" for HTML brief, "hd" for HTML detailed), "x" means XML output, "t" means plain text output, "id" means no output at all but to return list of recIDs found. (Suitable for high-level API.) ot - output only these MARC tags (e.g. "100,700,909C0b"). Useful if only some fields are to be shown in the output, e.g. for library to control some fields. - as - advanced search ("0" means no, "1" means yes). Whether + aas - advanced search ("0" means no, "1" means yes). Whether search was called from within the advanced search interface. p1 - first pattern to search for in the advanced search interface. Much like 'p'. f1 - first field to search within in the advanced search interface. Much like 'f'. m1 - first matching type in the advanced search interface. ("a" all of the words, "o" any of the words, "e" exact phrase, "p" partial phrase, "r" regular expression). op1 - first operator, to join the first and the second unit in the advanced search interface. ("a" add, "o" or, "n" not). p2 - second pattern to search for in the advanced search interface. Much like 'p'. f2 - second field to search within in the advanced search interface. Much like 'f'. m2 - second matching type in the advanced search interface. ("a" all of the words, "o" any of the words, "e" exact phrase, "p" partial phrase, "r" regular expression). op2 - second operator, to join the second and the third unit in the advanced search interface. ("a" add, "o" or, "n" not). p3 - third pattern to search for in the advanced search interface. Much like 'p'. f3 - third field to search within in the advanced search interface. Much like 'f'. m3 - third matching type in the advanced search interface. ("a" all of the words, "o" any of the words, "e" exact phrase, "p" partial phrase, "r" regular expression). sc - split by collection ("0" no, "1" yes). Governs whether we want to present the results in a single huge list, or splitted by collection. jrec - jump to record (e.g. "234"). Used for navigation inside the search results. recid - display record ID (e.g. "20000"). Do not search/browse but go straight away to the Detailed record page for the given recID. recidb - display record ID bis (e.g. "20010"). If greater than 'recid', then display records from recid to recidb. Useful for example for dumping records from the database for reformatting. sysno - display old system SYS number (e.g. ""). If you migrate to CDS Invenio from another system, and store your old SYS call numbers, you can use them instead of recid if you wish so. id - the same as recid, in case recid is not set. For backwards compatibility. idb - the same as recid, in case recidb is not set. For backwards compatibility. sysnb - the same as sysno, in case sysno is not set. For backwards compatibility. action - action to do. "SEARCH" for searching, "Browse" for browsing. Default is to search. d1 - first datetime in full YYYY-mm-dd HH:MM:DD format (e.g. "1998-08-23 12:34:56"). Useful for search limits on creation/modification date (see 'dt' argument below). Note that 'd1' takes precedence over d1y, d1m, d1d if these are defined. d1y - first date's year (e.g. "1998"). Useful for search limits on creation/modification date. d1m - first date's month (e.g. "08"). Useful for search limits on creation/modification date. d1d - first date's day (e.g. "23"). Useful for search limits on creation/modification date. d2 - second datetime in full YYYY-mm-dd HH:MM:DD format (e.g. "1998-09-02 12:34:56"). Useful for search limits on creation/modification date (see 'dt' argument below). Note that 'd2' takes precedence over d2y, d2m, d2d if these are defined. d2y - second date's year (e.g. "1998"). Useful for search limits on creation/modification date. d2m - second date's month (e.g. "09"). Useful for search limits on creation/modification date. d2d - second date's day (e.g. "02"). Useful for search limits on creation/modification date. dt - first and second date's type (e.g. "c"). Specifies whether to search in creation dates ("c") or in modification dates ("m"). When dt is not set and d1* and d2* are set, the default is "c". verbose - verbose level (0=min, 9=max). Useful to print some internal information on the searching process in case something goes wrong. ap - alternative patterns (0=no, 1=yes). In case no exact match is found, the search engine can try alternative patterns e.g. to replace non-alphanumeric characters by a boolean query. ap defines if this is wanted. ln - language of the search interface (e.g. "en"). Useful for internationalization. ec - list of external search engines to search as well (e.g. "SPIRES HEP"). """ selected_external_collections_infos = None # wash output format: of = wash_output_format(of) # for every search engine request asking for an HTML output, we # first regenerate cache of collection and field I18N names if # needed; so that later we won't bother checking timestamps for # I18N names at all: if of.startswith("h"): collection_i18nname_cache.recreate_cache_if_needed() field_i18nname_cache.recreate_cache_if_needed() # wash all arguments requiring special care try: (cc, colls_to_display, colls_to_search) = wash_colls(cc, c, sc) # which colls to search and to display? except InvenioWebSearchUnknownCollectionError, exc: colname = exc.colname if of.startswith("h"): - page_start(req, of, cc, as, ln, getUid(req), + page_start(req, of, cc, aas, ln, getUid(req), websearch_templates.tmpl_collection_not_found_page_title(colname, ln)) req.write(websearch_templates.tmpl_collection_not_found_page_body(colname, ln)) return page_end(req, of, ln) elif of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) else: return page_end(req, of, ln) p = wash_pattern(p) f = wash_field(f) p1 = wash_pattern(p1) f1 = wash_field(f1) p2 = wash_pattern(p2) f2 = wash_field(f2) p3 = wash_pattern(p3) f3 = wash_field(f3) datetext1, datetext2 = wash_dates(d1, d1y, d1m, d1d, d2, d2y, d2m, d2d) # wash ranking method: if not is_method_valid(None, rm): rm = "" _ = gettext_set_language(ln) # backwards compatibility: id, idb, sysnb -> recid, recidb, sysno (if applicable) if sysnb != "" and sysno == "": sysno = sysnb if id > 0 and recid == -1: recid = id if idb > 0 and recidb == -1: recidb = idb # TODO deduce passed search limiting criterias (if applicable) pl, pl_in_url = "", "" # no limits by default if action != "browse" and req and req.args: # we do not want to add options while browsing or while calling via command-line fieldargs = cgi.parse_qs(req.args) for fieldcode in get_fieldcodes(): if fieldargs.has_key(fieldcode): for val in fieldargs[fieldcode]: pl += "+%s:\"%s\" " % (fieldcode, val) pl_in_url += "&%s=%s" % (urllib.quote(fieldcode), urllib.quote(val)) # deduce recid from sysno argument (if applicable): if sysno: # ALEPH SYS number was passed, so deduce DB recID for the record: recid = get_mysql_recid_from_aleph_sysno(sysno) if recid is None: recid = 0 # use recid 0 to indicate that this sysno does not exist # deduce collection we are in (if applicable): if recid > 0: referer = None if req: referer = req.headers_in.get('Referer') cc = guess_collection_of_a_record(recid, referer) # deduce user id (if applicable): try: uid = getUid(req) except: uid = 0 ## 0 - start output if recid >= 0: # recid can be 0 if deduced from sysno and if such sysno does not exist ## 1 - detailed record display title, description, keywords = \ websearch_templates.tmpl_record_page_header_content(req, recid, ln) if req is not None and not req.header_only: - page_start(req, of, cc, as, ln, uid, title, description, keywords, recid, tab) + page_start(req, of, cc, aas, ln, uid, title, description, keywords, recid, tab) # Default format is hb but we are in detailed -> change 'of' if of == "hb": of = "hd" if record_exists(recid): if recidb <= recid: # sanity check recidb = recid + 1 if of == "id": return [recidx for recidx in range(recid, recidb) if record_exists(recidx)] else: print_records(req, range(recid, recidb), -1, -9999, of, ot, ln, search_pattern=p, verbose=verbose, tab=tab) if req and of.startswith("h"): # register detailed record page view event client_ip_address = str(req.get_remote_host(apache.REMOTE_NOLOOKUP)) register_page_view_event(recid, uid, client_ip_address) else: # record does not exist if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) elif of.startswith("h"): if req.header_only: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND else: print_warning(req, _("Requested record does not seem to exist.")) elif action == "browse": ## 2 - browse needed of = 'hb' - page_start(req, of, cc, as, ln, uid, _("Browse"), p=create_page_title_search_pattern_info(p, p1, p2, p3)) - req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, as, ln, p1, f1, m1, op1, + page_start(req, of, cc, aas, ln, uid, _("Browse"), p=create_page_title_search_pattern_info(p, p1, p2, p3)) + req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action)) try: - if as == 1 or (p1 or p2 or p3): + if aas == 1 or (p1 or p2 or p3): browse_pattern(req, colls_to_search, p1, f1, rg, ln) browse_pattern(req, colls_to_search, p2, f2, rg, ln) browse_pattern(req, colls_to_search, p3, f3, rg, ln) else: browse_pattern(req, colls_to_search, p, f, rg, ln) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) elif rm and p.startswith("recid:"): ## 3-ter - similarity search or citation search needed if not req.header_only: - page_start(req, of, cc, as, ln, uid, _("Search Results"), p=create_page_title_search_pattern_info(p, p1, p2, p3)) + page_start(req, of, cc, aas, ln, uid, _("Search Results"), p=create_page_title_search_pattern_info(p, p1, p2, p3)) if of.startswith("h"): - req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, as, ln, p1, f1, m1, op1, + req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action)) if record_exists(p[6:]) != 1: # record does not exist if of.startswith("h"): if req.header_only: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND else: print_warning(req, "Requested record does not seem to exist.") if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) else: # record well exists, so find similar ones to it t1 = os.times()[4] results_similar_recIDs, results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, results_similar_comments = \ rank_records(rm, 0, get_collection_reclist(cc), string.split(p), verbose) if results_similar_recIDs: t2 = os.times()[4] cpu_time = t2 - t1 if of.startswith("h"): req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, cc, len(results_similar_recIDs), - jrec, rg, as, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, + jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) print_warning(req, results_similar_comments) print_records(req, results_similar_recIDs, jrec, rg, of, ot, ln, results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, search_pattern=p, verbose=verbose) elif of=="id": return results_similar_recIDs elif of.startswith("x"): print_records(req, results_similar_recIDs, jrec, rg, of, ot, ln, results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, search_pattern=p, verbose=verbose) else: # rank_records failed and returned some error message to display: if of.startswith("h"): print_warning(req, results_similar_relevances_prologue) print_warning(req, results_similar_relevances_epilogue) print_warning(req, results_similar_comments) if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) elif p.startswith("cocitedwith:"): #WAS EXPERIMENTAL ## 3-terter - cited by search needed - page_start(req, of, cc, as, ln, uid, _("Search Results"), p=create_page_title_search_pattern_info(p, p1, p2, p3)) + page_start(req, of, cc, aas, ln, uid, _("Search Results"), p=create_page_title_search_pattern_info(p, p1, p2, p3)) if of.startswith("h"): - req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, as, ln, p1, f1, m1, op1, + req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action)) recID = p[12:] if record_exists(recID) != 1: # record does not exist if of.startswith("h"): print_warning(req, "Requested record does not seem to exist.") if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) else: # record well exists, so find co-cited ones: t1 = os.times()[4] results_cocited_recIDs = map(lambda x: x[0], calculate_co_cited_with_list(int(recID))) if results_cocited_recIDs: t2 = os.times()[4] cpu_time = t2 - t1 if of.startswith("h"): req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, CFG_SITE_NAME, len(results_cocited_recIDs), - jrec, rg, as, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, + jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) print_records(req, results_cocited_recIDs, jrec, rg, of, ot, ln, search_pattern=p, verbose=verbose) elif of=="id": return results_cocited_recIDs elif of.startswith("x"): print_records(req, results_cocited_recIDs, jrec, rg, of, ot, ln, search_pattern=p, verbose=verbose) else: # cited rank_records failed and returned some error message to display: if of.startswith("h"): print_warning(req, "nothing found") if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) else: ## 3 - common search needed query_in_cache = False query_representation_in_cache = repr((p,f,colls_to_search)) - page_start(req, of, cc, as, ln, uid, p=create_page_title_search_pattern_info(p, p1, p2, p3)) + page_start(req, of, cc, aas, ln, uid, p=create_page_title_search_pattern_info(p, p1, p2, p3)) if of.startswith("h"): - req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, as, ln, p1, f1, m1, op1, + req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action)) t1 = os.times()[4] results_in_any_collection = HitSet() - if as == 1 or (p1 or p2 or p3): + if aas == 1 or (p1 or p2 or p3): ## 3A - advanced search try: results_in_any_collection = search_pattern_parenthesised(req, p1, f1, m1, ap=ap, of=of, verbose=verbose, ln=ln) if len(results_in_any_collection) == 0: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) if p2: results_tmp = search_pattern_parenthesised(req, p2, f2, m2, ap=ap, of=of, verbose=verbose, ln=ln) if op1 == "a": # add results_in_any_collection.intersection_update(results_tmp) elif op1 == "o": # or results_in_any_collection.union_update(results_tmp) elif op1 == "n": # not results_in_any_collection.difference_update(results_tmp) else: if of.startswith("h"): print_warning(req, "Invalid set operation %s." % cgi.escape(op1), "Error") if len(results_in_any_collection) == 0: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) if p3: results_tmp = search_pattern_parenthesised(req, p3, f3, m3, ap=ap, of=of, verbose=verbose, ln=ln) if op2 == "a": # add results_in_any_collection.intersection_update(results_tmp) elif op2 == "o": # or results_in_any_collection.union_update(results_tmp) elif op2 == "n": # not results_in_any_collection.difference_update(results_tmp) else: if of.startswith("h"): print_warning(req, "Invalid set operation %s." % cgi.escape(op2), "Error") except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) else: ## 3B - simple search if search_results_cache.cache.has_key(query_representation_in_cache): # query is not in the cache already, so reuse it: query_in_cache = True results_in_any_collection = search_results_cache.cache[query_representation_in_cache] if verbose and of.startswith("h"): print_warning(req, "Search stage 0: query found in cache, reusing cached results.") else: try: results_in_any_collection = search_pattern_parenthesised(req, p, f, ap=ap, of=of, verbose=verbose, ln=ln) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if len(results_in_any_collection) == 0: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) # store this search query results into search results cache if needed: if CFG_WEBSEARCH_SEARCH_CACHE_SIZE and not query_in_cache: if len(search_results_cache.cache) > CFG_WEBSEARCH_SEARCH_CACHE_SIZE: search_results_cache.clear() search_results_cache.cache[query_representation_in_cache] = results_in_any_collection if verbose and of.startswith("h"): print_warning(req, "Search stage 3: storing query results in cache.") # search stage 4: intersection with collection universe: try: results_final = intersect_results_with_collrecs(req, results_in_any_collection, colls_to_search, ap, of, verbose, ln) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if results_final == {}: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) if of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) # search stage 5: apply search option limits and restrictions: if datetext1 != "": if verbose and of.startswith("h"): print_warning(req, "Search stage 5: applying time etc limits, from %s until %s..." % (datetext1, datetext2)) try: results_final = intersect_results_with_hitset(req, results_final, search_unit_in_bibrec(datetext1, datetext2, dt), ap, aptext= _("No match within your time limits, " "discarding this condition..."), of=of) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if results_final == {}: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if pl: pl = wash_pattern(pl) if verbose and of.startswith("h"): print_warning(req, "Search stage 5: applying search pattern limit %s..." % cgi.escape(pl)) try: results_final = intersect_results_with_hitset(req, results_final, search_pattern_parenthesised(req, pl, ap=0, ln=ln), ap, aptext=_("No match within your search limits, " "discarding this condition..."), of=of) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if results_final == {}: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) if of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) t2 = os.times()[4] cpu_time = t2 - t1 ## search stage 6: display results: results_final_nb_total = 0 results_final_nb = {} # will hold number of records found in each collection # (in simple dict to display overview more easily) for coll in results_final.keys(): results_final_nb[coll] = len(results_final[coll]) #results_final_nb_total += results_final_nb[coll] # Now let us calculate results_final_nb_total more precisely, # in order to get the total number of "distinct" hits across # searched collections; this is useful because a record might # have been attributed to more than one primary collection; so # we have to avoid counting it multiple times. The price to # pay for this accuracy of results_final_nb_total is somewhat # increased CPU time. if results_final.keys() == 1: # only one collection; no need to union them results_final_for_all_selected_colls = results_final.values()[0] results_final_nb_total = results_final_nb.values()[0] else: # okay, some work ahead to union hits across collections: results_final_for_all_selected_colls = HitSet() for coll in results_final.keys(): results_final_for_all_selected_colls.union_update(results_final[coll]) results_final_nb_total = len(results_final_for_all_selected_colls) if results_final_nb_total == 0: if of.startswith('h'): print_warning(req, "No match found, please enter different search terms.") elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) else: # yes, some hits found: good! # collection list may have changed due to not-exact-match-found policy so check it out: for coll in results_final.keys(): if coll not in colls_to_search: colls_to_search.append(coll) # print results overview: if of == "id": # we have been asked to return list of recIDs recIDs = list(results_final_for_all_selected_colls) if sf: # do we have to sort? recIDs = sort_records(req, recIDs, sf, so, sp, verbose, of) elif rm: # do we have to rank? results_final_for_all_colls_rank_records_output = rank_records(rm, 0, results_final_for_all_selected_colls, string.split(p) + string.split(p1) + string.split(p2) + string.split(p3), verbose) if results_final_for_all_colls_rank_records_output[0]: recIDs = results_final_for_all_colls_rank_records_output[0] return recIDs elif of.startswith("h"): if of not in ['hcs']: req.write(print_results_overview(req, colls_to_search, results_final_nb_total, results_final_nb, cpu_time, ln, ec)) selected_external_collections_infos = print_external_results_overview(req, cc, [p, p1, p2, p3], f, ec, verbose, ln) # print number of hits found for XML outputs: if of.startswith("x"): req.write("<!-- Search-Engine-Total-Number-Of-Results: %s -->\n" % results_final_nb_total) # print records: if of in ['hcs']: # feed the current search to be summarized: from invenio.search_engine_summarizer import summarize_records summarize_records(results_final_for_all_selected_colls, 'hcs', ln, p, f, req) else: if len(colls_to_search)>1: cpu_time = -1 # we do not want to have search time printed on each collection print_records_prologue(req, of) for coll in colls_to_search: if results_final.has_key(coll) and len(results_final[coll]): if of.startswith("h"): req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, coll, results_final_nb[coll], - jrec, rg, as, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, + jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) results_final_recIDs = list(results_final[coll]) results_final_relevances = [] results_final_relevances_prologue = "" results_final_relevances_epilogue = "" if sf: # do we have to sort? results_final_recIDs = sort_records(req, results_final_recIDs, sf, so, sp, verbose, of) elif rm: # do we have to rank? results_final_recIDs_ranked, results_final_relevances, results_final_relevances_prologue, results_final_relevances_epilogue, results_final_comments = \ rank_records(rm, 0, results_final[coll], string.split(p) + string.split(p1) + string.split(p2) + string.split(p3), verbose) if of.startswith("h"): print_warning(req, results_final_comments) if results_final_recIDs_ranked: results_final_recIDs = results_final_recIDs_ranked else: # rank_records failed and returned some error message to display: print_warning(req, results_final_relevances_prologue) print_warning(req, results_final_relevances_epilogue) print_records(req, results_final_recIDs, jrec, rg, of, ot, ln, results_final_relevances, results_final_relevances_prologue, results_final_relevances_epilogue, search_pattern=p, print_records_prologue_p=False, print_records_epilogue_p=False, verbose=verbose) if of.startswith("h"): req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, coll, results_final_nb[coll], - jrec, rg, as, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, + jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1)) print_records_epilogue(req, of) if f == "author" and of.startswith("h"): req.write(create_similarly_named_authors_link_box(p, ln)) # log query: try: id_query = log_query(req.get_remote_host(), req.args, uid) if of.startswith("h") and id_query: if not of in ['hcs']: # display alert/RSS teaser for non-summary formats: req.write(websearch_templates.tmpl_alert_rss_teaser_box_for_query(id_query, ln=ln)) except: # do not log query if req is None (used by CLI interface) pass log_query_info("ss", p, f, colls_to_search, results_final_nb_total) # External searches if of.startswith("h"): if not of in ['hcs']: perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) def perform_request_cache(req, action="show"): """Manipulates the search engine cache.""" req.content_type = "text/html" req.send_http_header() req.write("<html>") out = "" out += "<h1>Search Cache</h1>" # clear cache if requested: if action == "clear": search_results_cache.clear() req.write(out) # show collection reclist cache: out = "<h3>Collection reclist cache</h3>" out += "- collection table last updated: %s" % get_table_update_time('collection') out += "<br />- reclist cache timestamp: %s" % collection_reclist_cache.timestamp out += "<br />- reclist cache contents:" out += "<blockquote>" for coll in collection_reclist_cache.cache.keys(): if collection_reclist_cache.cache[coll]: out += "%s (%d)<br />" % (coll, len(collection_reclist_cache.cache[coll])) out += "</blockquote>" req.write(out) # show search results cache: out = "<h3>Search Cache</h3>" out += "- search cache usage: %d queries cached (max. ~%d)" % \ (len(search_results_cache.cache), CFG_WEBSEARCH_SEARCH_CACHE_SIZE) if len(search_results_cache.cache): out += "<br />- search cache contents:" out += "<blockquote>" for query, hitset in search_results_cache.cache.items(): out += "<br />%s ... %s" % (query, hitset) out += """<p><a href="%s/search/cache?action=clear">clear search results cache</a>""" % CFG_SITE_URL out += "</blockquote>" req.write(out) # show field i18nname cache: out = "<h3>Field I18N names cache</h3>" out += "- fieldname table last updated: %s" % get_table_update_time('fieldname') out += "<br />- i18nname cache timestamp: %s" % field_i18nname_cache.timestamp out += "<br />- i18nname cache contents:" out += "<blockquote>" for field in field_i18nname_cache.cache.keys(): for ln in field_i18nname_cache.cache[field].keys(): out += "%s, %s = %s<br />" % (field, ln, field_i18nname_cache.cache[field][ln]) out += "</blockquote>" req.write(out) # show collection i18nname cache: out = "<h3>Collection I18N names cache</h3>" out += "- collectionname table last updated: %s" % get_table_update_time('collectionname') out += "<br />- i18nname cache timestamp: %s" % collection_i18nname_cache.timestamp out += "<br />- i18nname cache contents:" out += "<blockquote>" for coll in collection_i18nname_cache.cache.keys(): for ln in collection_i18nname_cache.cache[coll].keys(): out += "%s, %s = %s<br />" % (coll, ln, collection_i18nname_cache.cache[coll][ln]) out += "</blockquote>" req.write(out) req.write("</html>") return "\n" def perform_request_log(req, date=""): """Display search log information for given date.""" req.content_type = "text/html" req.send_http_header() req.write("<html>") req.write("<h1>Search Log</h1>") if date: # case A: display stats for a day yyyymmdd = string.atoi(date) req.write("<p><big><strong>Date: %d</strong></big><p>" % yyyymmdd) req.write("""<table border="1">""") req.write("<tr><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td></tr>" % ("No.", "Time", "Pattern", "Field", "Collection", "Number of Hits")) # read file: p = os.popen("grep ^%d %s/search.log" % (yyyymmdd, CFG_LOGDIR), 'r') lines = p.readlines() p.close() # process lines: i = 0 for line in lines: try: - datetime, as, p, f, c, nbhits = string.split(line,"#") + datetime, aas, p, f, c, nbhits = string.split(line,"#") i += 1 req.write("<tr><td align=\"right\">#%d</td><td>%s:%s:%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>" \ % (i, datetime[8:10], datetime[10:12], datetime[12:], p, f, c, nbhits)) except: pass # ignore eventual wrong log lines req.write("</table>") else: # case B: display summary stats per day yyyymm01 = int(time.strftime("%Y%m01", time.localtime())) yyyymmdd = int(time.strftime("%Y%m%d", time.localtime())) req.write("""<table border="1">""") req.write("<tr><td><strong>%s</strong></td><td><strong>%s</strong></tr>" % ("Day", "Number of Queries")) for day in range(yyyymm01, yyyymmdd + 1): p = os.popen("grep -c ^%d %s/search.log" % (day, CFG_LOGDIR), 'r') for line in p.readlines(): req.write("""<tr><td>%s</td><td align="right"><a href="%s/search/log?date=%d">%s</a></td></tr>""" % \ (day, CFG_SITE_URL, day, line)) p.close() req.write("</table>") req.write("</html>") return "\n" def get_most_popular_field_values(recids, tags, exclude_values=None, count_repetitive_values=True): """ Analyze RECIDS and look for TAGS and return most popular values and the frequency with which they occur sorted according to descending frequency. If a value is found in EXCLUDE_VALUES, then do not count it. If COUNT_REPETITIVE_VALUES is True, then we count every occurrence of value in the tags. If False, then we count the value only once regardless of the number of times it may appear in a record. (But, if the same value occurs in another record, we count it, of course.) Example: >>> get_most_popular_field_values(range(11,20), '980__a') (('PREPRINT', 10), ('THESIS', 7), ...) >>> get_most_popular_field_values(range(11,20), ('100__a', '700__a')) (('Ellis, J', 10), ('Ellis, N', 7), ...) >>> get_most_popular_field_values(range(11,20), ('100__a', '700__a'), ('Ellis, J')) (('Ellis, N', 7), ...) """ def _get_most_popular_field_values_helper_sorter(val1, val2): "Compare VAL1 and VAL2 according to, firstly, frequency, then secondly, alphabetically." compared_via_frequencies = cmp(valuefreqdict[val2], valuefreqdict[val1]) if compared_via_frequencies == 0: return cmp(val1.lower(), val2.lower()) else: return compared_via_frequencies valuefreqdict = {} ## sanity check: if not exclude_values: exclude_values = [] if isinstance(tags, str): tags = (tags,) ## find values to count: vals_to_count = [] if count_repetitive_values: # counting technique A: can look up many records at once: (very fast) for tag in tags: vals_to_count.extend(get_fieldvalues(recids, tag)) else: # counting technique B: must count record-by-record: (slow) for recid in recids: vals_in_rec = [] for tag in tags: for val in get_fieldvalues(recid, tag, False): vals_in_rec.append(val) # do not count repetitive values within this record # (even across various tags, so need to unify again): dtmp = {} for val in vals_in_rec: dtmp[val] = 1 vals_in_rec = dtmp.keys() vals_to_count.extend(vals_in_rec) ## are we to exclude some of found values? for val in vals_to_count: if val not in exclude_values: if valuefreqdict.has_key(val): valuefreqdict[val] += 1 else: valuefreqdict[val] = 1 ## sort by descending frequency of values: out = () vals = valuefreqdict.keys() vals.sort(_get_most_popular_field_values_helper_sorter) for val in vals: out += (val, valuefreqdict[val]), return out def profile(p="", f="", c=CFG_SITE_NAME): """Profile search time.""" import profile import pstats profile.run("perform_request_search(p='%s',f='%s', c='%s')" % (p, f, c), "perform_request_search_profile") p = pstats.Stats("perform_request_search_profile") p.strip_dirs().sort_stats("cumulative").print_stats() return 0 ## test cases: #print wash_colls(CFG_SITE_NAME,"Library Catalogue", 0) #print wash_colls("Periodicals & Progress Reports",["Periodicals","Progress Reports"], 0) #print wash_field("wau") #print print_record(20,"tm","001,245") #print create_opft_search_units(None, "PHE-87-13","reportnumber") #print ":"+wash_pattern("* and % doo * %")+":\n" #print ":"+wash_pattern("*")+":\n" #print ":"+wash_pattern("ellis* ell* e*%")+":\n" #print run_sql("SELECT name,dbquery from collection") #print get_index_id("author") #print get_coll_ancestors("Theses") #print get_coll_sons("Articles & Preprints") #print get_coll_real_descendants("Articles & Preprints") #print get_collection_reclist("Theses") #print log(sys.stdin) #print search_unit_in_bibrec('2002-12-01','2002-12-12') #print type(wash_url_argument("-1",'int')) #print get_nearest_terms_in_bibxxx("ellis", "author", 5, 5) #print call_bibformat(68, "HB_FLY") #print get_fieldvalues(10, "980__a") #print get_fieldvalues_alephseq_like(10,"001___") #print get_fieldvalues_alephseq_like(10,"980__a") #print get_fieldvalues_alephseq_like(10,"foo") #print get_fieldvalues_alephseq_like(10,"-1") #print get_fieldvalues_alephseq_like(10,"99") #print get_fieldvalues_alephseq_like(10,["001", "980"]) ## profiling: #profile("of the this") #print perform_request_search(p="ellis") diff --git a/modules/websearch/lib/websearch_regression_tests.py b/modules/websearch/lib/websearch_regression_tests.py index 44de8d96b..334fddeda 100644 --- a/modules/websearch/lib/websearch_regression_tests.py +++ b/modules/websearch/lib/websearch_regression_tests.py @@ -1,1461 +1,1465 @@ # -*- coding: utf-8 -*- ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. # pylint: disable-msg=C0301 # pylint: disable-msg=E1102 """WebSearch module regression tests.""" __revision__ = "$Id$" import unittest import re import urlparse, cgi from sets import Set from mechanize import Browser, LinkNotFoundError, HTTPError from invenio.config import CFG_SITE_URL, CFG_SITE_NAME, CFG_SITE_LANG from invenio.testutils import make_test_suite, \ run_test_suite, \ make_url, test_web_page_content, \ merge_error_messages from invenio.urlutils import same_urls_p from invenio.search_engine import perform_request_search, \ guess_primary_collection_of_a_record, guess_collection_of_a_record, \ collection_restricted_p, get_permitted_restricted_collections, \ get_fieldvalues def parse_url(url): parts = urlparse.urlparse(url) query = cgi.parse_qs(parts[4], True) return parts[2].split('/')[1:], query class WebSearchWebPagesAvailabilityTest(unittest.TestCase): """Check WebSearch web pages whether they are up or not.""" def test_search_interface_pages_availability(self): """websearch - availability of search interface pages""" baseurl = CFG_SITE_URL + '/' _exports = ['', 'collection/Poetry', 'collection/Poetry?as=1'] error_messages = [] for url in [baseurl + page for page in _exports]: error_messages.extend(test_web_page_content(url)) if error_messages: self.fail(merge_error_messages(error_messages)) return def test_search_results_pages_availability(self): """websearch - availability of search results pages""" baseurl = CFG_SITE_URL + '/search' _exports = ['', '?c=Poetry', '?p=ellis', '/cache', '/log'] error_messages = [] for url in [baseurl + page for page in _exports]: error_messages.extend(test_web_page_content(url)) if error_messages: self.fail(merge_error_messages(error_messages)) return def test_search_detailed_record_pages_availability(self): """websearch - availability of search detailed record pages""" baseurl = CFG_SITE_URL + '/record/' _exports = ['', '1', '1/', '1/files', '1/files/'] error_messages = [] for url in [baseurl + page for page in _exports]: error_messages.extend(test_web_page_content(url)) if error_messages: self.fail(merge_error_messages(error_messages)) return def test_browse_results_pages_availability(self): """websearch - availability of browse results pages""" baseurl = CFG_SITE_URL + '/search' _exports = ['?p=ellis&f=author&action_browse=Browse'] error_messages = [] for url in [baseurl + page for page in _exports]: error_messages.extend(test_web_page_content(url)) if error_messages: self.fail(merge_error_messages(error_messages)) return def test_help_page_availability(self): """websearch - availability of Help Central page""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help', expected_text="Help Central")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/?ln=fr', expected_text="Centre d'aide")) def test_search_tips_page_availability(self): """websearch - availability of Search Tips""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-tips', expected_text="Search Tips")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-tips?ln=fr', expected_text="Conseils de recherche")) def test_search_guide_page_availability(self): """websearch - availability of Search Guide""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-guide', expected_text="Search Guide")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-guide?ln=fr', expected_text="Guide de recherche")) class WebSearchTestLegacyURLs(unittest.TestCase): """ Check that the application still responds to legacy URLs for navigating, searching and browsing.""" def test_legacy_collections(self): """ websearch - collections handle legacy urls """ browser = Browser() def check(legacy, new, browser=browser): browser.open(legacy) got = browser.geturl() self.failUnless(same_urls_p(got, new), got) # Use the root URL unless we need more check(make_url('/', c=CFG_SITE_NAME), make_url('/', ln=CFG_SITE_LANG)) # Other collections are redirected in the /collection area check(make_url('/', c='Poetry'), make_url('/collection/Poetry', ln=CFG_SITE_LANG)) # Drop unnecessary arguments, like ln and as (when they are # the default value) - check(make_url('/', c='Poetry', as=0), + args = {'as': 0} + check(make_url('/', c='Poetry', **args), make_url('/collection/Poetry', ln=CFG_SITE_LANG)) # Otherwise, keep them - check(make_url('/', c='Poetry', as=1), - make_url('/collection/Poetry', as=1, ln=CFG_SITE_LANG)) + args = {'as': 1, 'ln': CFG_SITE_LANG} + check(make_url('/', c='Poetry', **args), + make_url('/collection/Poetry', **args)) # Support the /index.py addressing too check(make_url('/index.py', c='Poetry'), make_url('/collection/Poetry', ln=CFG_SITE_LANG)) def test_legacy_search(self): """ websearch - search queries handle legacy urls """ browser = Browser() def check(legacy, new, browser=browser): browser.open(legacy) got = browser.geturl() self.failUnless(same_urls_p(got, new), got) # /search.py is redirected on /search # Note that `as' is a reserved word in Python 2.5 check(make_url('/search.py', p='nuclear', ln='en') + 'as=1', make_url('/search', p='nuclear', ln='en') + 'as=1') # direct recid searches are redirected to /record check(make_url('/search.py', recid=1, ln='es'), make_url('/record/1', ln='es')) def test_legacy_search_help_link(self): """websearch - legacy Search Help page link""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search/index.en.html', expected_text="Help Central")) def test_legacy_search_tips_link(self): """websearch - legacy Search Tips page link""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search/tips.fr.html', expected_text="Conseils de recherche")) def test_legacy_search_guide_link(self): """websearch - legacy Search Guide page link""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search/guide.en.html', expected_text="Search Guide")) class WebSearchTestRecord(unittest.TestCase): """ Check the interface of the /record results """ def test_format_links(self): """ websearch - check format links for records """ browser = Browser() # We open the record in all known HTML formats for hformat in ('hd', 'hx', 'hm'): browser.open(make_url('/record/1', of=hformat)) if hformat == 'hd': # hd format should have a link to the following # formats for oformat in ('hx', 'hm', 'xm', 'xd'): target = make_url('/record/1/export/%s?ln=en' % oformat) try: browser.find_link(url=target) except LinkNotFoundError: self.fail('link %r should be in page' % target) else: # non-hd HTML formats should have a link back to # the main detailed record target = make_url('/record/1') try: browser.find_link(url=target) except LinkNotFoundError: self.fail('link %r should be in page' % target) return def test_exported_formats(self): """ websearch - check formats exported through /record/1/export/ URLs""" browser = Browser() self.assertEqual([], test_web_page_content(make_url('/record/1/export/hm'), expected_text='245__ $$aALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/hd'), expected_text='<strong>ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/xm'), expected_text='<subfield code="a">ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/xd'), expected_text='<dc:title>ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/hs'), expected_text='<a href="/record/1?ln=%s">ALEPH experiment' % \ CFG_SITE_LANG)) self.assertEqual([], test_web_page_content(make_url('/record/1/export/hx'), expected_text='title = "ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/t?ot=245'), expected_text='245__ $$aALEPH experiment')) self.assertNotEqual([], test_web_page_content(make_url('/record/1/export/t?ot=245'), expected_text='001__')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/h?ot=245'), expected_text='245__ $$aALEPH experiment')) self.assertNotEqual([], test_web_page_content(make_url('/record/1/export/h?ot=245'), expected_text='001__')) return class WebSearchTestCollections(unittest.TestCase): def test_traversal_links(self): """ websearch - traverse all the publications of a collection """ browser = Browser() try: - for as in (0, 1): - browser.open(make_url('/collection/Preprints', as=as)) + for aas in (0, 1): + args = {'as': aas} + browser.open(make_url('/collection/Preprints', **args)) for jrec in (11, 21, 11, 27): args = {'jrec': jrec, 'cc': 'Preprints'} - if as: - args['as'] = as + if aas: + args['as'] = aas url = make_url('/search', **args) try: browser.follow_link(url=url) except LinkNotFoundError: args['ln'] = CFG_SITE_LANG url = make_url('/search', **args) browser.follow_link(url=url) except LinkNotFoundError: self.fail('no link %r in %r' % (url, browser.geturl())) def test_collections_links(self): """ websearch - enter in collections and subcollections """ browser = Browser() def tryfollow(url): cur = browser.geturl() body = browser.response().read() try: browser.follow_link(url=url) except LinkNotFoundError: print body self.fail("in %r: could not find %r" % ( cur, url)) return - for as in (0, 1): - if as: + for aas in (0, 1): + if aas: kargs = {'as': 1} else: kargs = {} kargs['ln'] = CFG_SITE_LANG # We navigate from immediate son to immediate son... browser.open(make_url('/', **kargs)) tryfollow(make_url('/collection/Articles%20%26%20Preprints', **kargs)) tryfollow(make_url('/collection/Articles', **kargs)) # But we can also jump to a grandson immediately browser.back() browser.back() tryfollow(make_url('/collection/ALEPH', **kargs)) return def test_records_links(self): """ websearch - check the links toward records in leaf collections """ browser = Browser() browser.open(make_url('/collection/Preprints')) def harvest(): """ Parse all the links in the page, and check that for each link to a detailed record, we also have the corresponding link to the similar records.""" records = Set() similar = Set() for link in browser.links(): path, q = parse_url(link.url) if not path: continue if path[0] == 'record': records.add(int(path[1])) continue if path[0] == 'search': if not q.get('rm') == ['wrd']: continue recid = q['p'][0].split(':')[1] similar.add(int(recid)) self.failUnlessEqual(records, similar) return records # We must have 10 links to the corresponding /records found = harvest() self.failUnlessEqual(len(found), 10) # When clicking on the "Search" button, we must also have # these 10 links on the records. browser.select_form(name="search") browser.submit() found = harvest() self.failUnlessEqual(len(found), 10) return class WebSearchTestBrowse(unittest.TestCase): def test_browse_field(self): """ websearch - check that browsing works """ browser = Browser() browser.open(make_url('/')) browser.select_form(name='search') browser['f'] = ['title'] browser.submit(name='action_browse') def collect(): # We'll get a few links to search for the actual hits, plus a # link to the following results. res = [] for link in browser.links(url_regex=re.compile(CFG_SITE_URL + r'/search\?')): if link.text == 'Advanced Search': continue dummy, q = parse_url(link.url) res.append((link, q)) return res # if we follow the last link, we should get another # batch. There is an overlap of one item. batch_1 = collect() browser.follow_link(link=batch_1[-1][0]) batch_2 = collect() # FIXME: we cannot compare the whole query, as the collection # set is not equal self.failUnlessEqual(batch_1[-2][1]['p'], batch_2[0][1]['p']) class WebSearchTestOpenURL(unittest.TestCase): def test_isbn_01(self): """ websearch - isbn query via OpenURL 0.1""" browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/openurl', isbn='0387940758')) dummy, current_q = parse_url(browser.geturl()) self.failUnlessEqual(current_q, { 'sc' : ['1'], 'p' : ['isbn:"0387940758"'], 'of' : ['hd'] }) def test_isbn_10_rft_id(self): """ websearch - isbn query via OpenURL 1.0 - rft_id""" browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/openurl', rft_id='urn:ISBN:0387940758')) dummy, current_q = parse_url(browser.geturl()) self.failUnlessEqual(current_q, { 'sc' : ['1'], 'p' : ['isbn:"0387940758"'], 'of' : ['hd'] }) def test_isbn_10(self): """ websearch - isbn query via OpenURL 1.0""" browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/openurl?rft.isbn=0387940758')) dummy, current_q = parse_url(browser.geturl()) self.failUnlessEqual(current_q, { 'sc' : ['1'], 'p' : ['isbn:"0387940758"'], 'of' : ['hd'] }) class WebSearchTestSearch(unittest.TestCase): def test_hits_in_other_collection(self): """ websearch - check extension of a query to the home collection """ browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/collection/ISOLDE', ln='en')) browser.select_form(name='search') browser['f'] = ['author'] browser['p'] = 'matsubara' browser.submit() dummy, current_q = parse_url(browser.geturl()) link = browser.find_link(text_regex=re.compile('.*hit', re.I)) dummy, target_q = parse_url(link.url) # the target query should be the current query without any c # or cc specified. for f in ('cc', 'c', 'action_search'): if f in current_q: del current_q[f] self.failUnlessEqual(current_q, target_q) def test_nearest_terms(self): """ websearch - provide a list of nearest terms """ browser = Browser() browser.open(make_url('')) # Search something weird browser.select_form(name='search') browser['p'] = 'gronf' browser.submit() dummy, original = parse_url(browser.geturl()) for to_drop in ('cc', 'action_search', 'f'): if to_drop in original: del original[to_drop] if 'ln' not in original: original['ln'] = [CFG_SITE_LANG] # we should get a few searches back, which are identical # except for the p field being substituted (and the cc field # being dropped). if 'cc' in original: del original['cc'] for link in browser.links(url_regex=re.compile(CFG_SITE_URL + r'/search\?')): if link.text == 'Advanced Search': continue dummy, target = parse_url(link.url) if 'ln' not in target: target['ln'] = [CFG_SITE_LANG] original['p'] = [link.text] self.failUnlessEqual(original, target) return def test_switch_to_simple_search(self): """ websearch - switch to simple search """ browser = Browser() - browser.open(make_url('/collection/ISOLDE', as=1)) + args = {'as': 1} + browser.open(make_url('/collection/ISOLDE', **args)) browser.select_form(name='search') browser['p1'] = 'tandem' browser['f1'] = ['title'] browser.submit() browser.follow_link(text='Simple Search') dummy, q = parse_url(browser.geturl()) self.failUnlessEqual(q, {'cc': ['ISOLDE'], 'p': ['tandem'], 'f': ['title'], 'ln': ['en']}) def test_switch_to_advanced_search(self): """ websearch - switch to advanced search """ browser = Browser() browser.open(make_url('/collection/ISOLDE')) browser.select_form(name='search') browser['p'] = 'tandem' browser['f'] = ['title'] browser.submit() browser.follow_link(text='Advanced Search') dummy, q = parse_url(browser.geturl()) self.failUnlessEqual(q, {'cc': ['ISOLDE'], 'p1': ['tandem'], 'f1': ['title'], 'as': ['1'], 'ln' : ['en']}) def test_no_boolean_hits(self): """ websearch - check the 'no boolean hits' proposed links """ browser = Browser() browser.open(make_url('')) browser.select_form(name='search') browser['p'] = 'quasinormal muon' browser.submit() dummy, q = parse_url(browser.geturl()) for to_drop in ('cc', 'action_search', 'f'): if to_drop in q: del q[to_drop] for bsu in ('quasinormal', 'muon'): l = browser.find_link(text=bsu) q['p'] = bsu if not same_urls_p(l.url, make_url('/search', **q)): self.fail(repr((l.url, make_url('/search', **q)))) def test_similar_authors(self): """ websearch - test similar authors box """ browser = Browser() browser.open(make_url('')) browser.select_form(name='search') browser['p'] = 'Ellis, R K' browser['f'] = ['author'] browser.submit() l = browser.find_link(text="Ellis, R S") self.failUnless(same_urls_p(l.url, make_url('/search', p="Ellis, R S", f='author', ln='en'))) class WebSearchNearestTermsTest(unittest.TestCase): """Check various alternatives of searches leading to the nearest terms box.""" def test_nearest_terms_box_in_okay_query(self): """ websearch - no nearest terms box for a successful query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text="jump to record")) def test_nearest_terms_box_in_unsuccessful_simple_query(self): """ websearch - nearest terms box for unsuccessful simple query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellisz', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=embed", expected_link_label='embed')) def test_nearest_terms_box_in_unsuccessful_structured_query(self): """ websearch - nearest terms box for unsuccessful structured query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellisz&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=fabbro&f=author", expected_link_label='fabbro')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=author%3Aellisz', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=author%3Afabbro", expected_link_label='fabbro')) def test_nearest_terms_box_in_unsuccessful_phrase_query(self): """ websearch - nearest terms box for unsuccessful phrase query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%22Ellis%2C+Z%22', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=author%3A%22Enqvist%2C+K%22", expected_link_label='Enqvist, K')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%22ellisz%22&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=%22Enqvist%2C+K%22&f=author", expected_link_label='Enqvist, K')) def test_nearest_terms_box_in_unsuccessful_boolean_query(self): """ websearch - nearest terms box for unsuccessful boolean query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3Aellisz+author%3Aellisz', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aellisz", expected_link_label='energi')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3Aenergi+author%3Aenergie', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aenqvist", expected_link_label='enqvist')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?ln=en&p=title%3Aellisz+author%3Aellisz&f=keyword', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aellisz&f=keyword", expected_link_label='energi')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?ln=en&p=title%3Aenergi+author%3Aenergie&f=keyword', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aenqvist&f=keyword", expected_link_label='enqvist')) class WebSearchBooleanQueryTest(unittest.TestCase): """Check various boolean queries.""" def test_successful_boolean_query(self): """ websearch - successful boolean query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis+muon', expected_text="records found", expected_link_label="Detailed record")) def test_unsuccessful_boolean_query_where_all_individual_terms_match(self): """ websearch - unsuccessful boolean query where all individual terms match """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis+muon+letter', expected_text="Boolean query returned no hits. Please combine your search terms differently.")) class WebSearchAuthorQueryTest(unittest.TestCase): """Check various author-related queries.""" def test_propose_similar_author_names_box(self): """ websearch - propose similar author names box """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=Ellis%2C+R&f=author', expected_text="See also: similar author names", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=Ellis%2C+R+K&f=author", expected_link_label="Ellis, R K")) def test_do_not_propose_similar_author_names_box(self): """ websearch - do not propose similar author names box """ errmsgs = test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%22Ellis%2C+R%22', expected_link_target=CFG_SITE_URL+"/search?ln=en&p=Ellis%2C+R+K&f=author", expected_link_label="Ellis, R K") if errmsgs[0].find("does not contain link to") > -1: pass else: self.fail("Should not propose similar author names box.") return class WebSearchSearchEnginePythonAPITest(unittest.TestCase): """Check typical search engine Python API calls on the demo data.""" def test_search_engine_python_api_for_failed_query(self): """websearch - search engine Python API for failed query""" self.assertEqual([], perform_request_search(p='aoeuidhtns')) def test_search_engine_python_api_for_successful_query(self): """websearch - search engine Python API for successful query""" self.assertEqual([8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 47], perform_request_search(p='ellis')) def test_search_engine_python_api_for_existing_record(self): """websearch - search engine Python API for existing record""" self.assertEqual([8], perform_request_search(recid=8)) def test_search_engine_python_api_for_nonexisting_record(self): """websearch - search engine Python API for non-existing record""" self.assertEqual([], perform_request_search(recid=1234567809)) def test_search_engine_python_api_for_nonexisting_collection(self): """websearch - search engine Python API for non-existing collection""" self.assertEqual([], perform_request_search(c='Foo')) def test_search_engine_python_api_for_range_of_records(self): """websearch - search engine Python API for range of records""" self.assertEqual([1, 2, 3, 4, 5, 6, 7, 8, 9], perform_request_search(recid=1, recidb=10)) class WebSearchSearchEngineWebAPITest(unittest.TestCase): """Check typical search engine Web API calls on the demo data.""" def test_search_engine_web_api_for_failed_query(self): """websearch - search engine Web API for failed query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=aoeuidhtns&of=id', expected_text="[]")) def test_search_engine_web_api_for_successful_query(self): """websearch - search engine Web API for successful query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis&of=id', expected_text="[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 47]")) def test_search_engine_web_api_for_existing_record(self): """websearch - search engine Web API for existing record""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?recid=8&of=id', expected_text="[8]")) def test_search_engine_web_api_for_nonexisting_record(self): """websearch - search engine Web API for non-existing record""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?recid=123456789&of=id', expected_text="[]")) def test_search_engine_web_api_for_nonexisting_collection(self): """websearch - search engine Web API for non-existing collection""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?c=Foo&of=id', expected_text="[]")) def test_search_engine_web_api_for_range_of_records(self): """websearch - search engine Web API for range of records""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?recid=1&recidb=10&of=id', expected_text="[1, 2, 3, 4, 5, 6, 7, 8, 9]")) class WebSearchRestrictedCollectionTest(unittest.TestCase): """Test of the restricted Theses collection behaviour.""" def test_restricted_collection_interface_page(self): """websearch - restricted collection interface page body""" # there should be no Latest additions box for restricted collections self.assertNotEqual([], test_web_page_content(CFG_SITE_URL + '/collection/Theses', expected_text="Latest additions")) def test_restricted_search_as_anonymous_guest(self): """websearch - restricted collection not searchable by anonymous guest""" browser = Browser() browser.open(CFG_SITE_URL + '/search?c=Theses') response = browser.response().read() if response.find("If you think you have right to access it, please authenticate yourself.") > -1: pass else: self.fail("Oops, searching restricted collection without password should have redirected to login dialog.") return def test_restricted_search_as_authorized_person(self): """websearch - restricted collection searchable by authorized person""" browser = Browser() browser.open(CFG_SITE_URL + '/search?c=Theses') browser.select_form(nr=0) browser['p_un'] = 'jekyll' browser['p_pw'] = 'j123ekyll' browser.submit() if browser.response().read().find("records found") > -1: pass else: self.fail("Oops, Dr. Jekyll should be able to search Theses collection.") def test_restricted_search_as_unauthorized_person(self): """websearch - restricted collection not searchable by unauthorized person""" browser = Browser() browser.open(CFG_SITE_URL + '/search?c=Theses') browser.select_form(nr=0) browser['p_un'] = 'hyde' browser['p_pw'] = 'h123yde' browser.submit() # Mr. Hyde should not be able to connect: if browser.response().read().find("Authorization failure") <= -1: # if we got here, things are broken: self.fail("Oops, Mr.Hyde should not be able to search Theses collection.") def test_restricted_detailed_record_page_as_anonymous_guest(self): """websearch - restricted detailed record page not accessible to guests""" browser = Browser() browser.open(CFG_SITE_URL + '/record/35') if browser.response().read().find("You can use your nickname or your email address to login.") > -1: pass else: self.fail("Oops, searching restricted collection without password should have redirected to login dialog.") return def test_restricted_detailed_record_page_as_authorized_person(self): """websearch - restricted detailed record page accessible to authorized person""" browser = Browser() browser.open(CFG_SITE_URL + '/youraccount/login') browser.select_form(nr=0) browser['p_un'] = 'jekyll' browser['p_pw'] = 'j123ekyll' browser.submit() browser.open(CFG_SITE_URL + '/record/35') # Dr. Jekyll should be able to connect # (add the pw to the whole CFG_SITE_URL because we shall be # redirected to '/reordrestricted/'): if browser.response().read().find("A High-performance Video Browsing System") > -1: pass else: self.fail("Oops, Dr. Jekyll should be able to access restricted detailed record page.") def test_restricted_detailed_record_page_as_unauthorized_person(self): """websearch - restricted detailed record page not accessible to unauthorized person""" browser = Browser() browser.open(CFG_SITE_URL + '/youraccount/login') browser.select_form(nr=0) browser['p_un'] = 'hyde' browser['p_pw'] = 'h123yde' browser.submit() browser.open(CFG_SITE_URL + '/record/35') # Mr. Hyde should not be able to connect: if browser.response().read().find('You are not authorized') <= -1: # if we got here, things are broken: self.fail("Oops, Mr.Hyde should not be able to access restricted detailed record page.") def test_collection_restricted_p(self): """websearch - collection_restricted_p""" self.failUnless(collection_restricted_p('Theses'), True) self.failIf(collection_restricted_p('Books & Reports')) def test_get_permitted_restricted_collections(self): """websearch - get_permitted_restricted_collections""" from invenio.webuser import get_uid_from_email, collect_user_info self.assertEqual(get_permitted_restricted_collections(collect_user_info(get_uid_from_email('jekyll@cds.cern.ch'))), ['Theses']) self.assertEqual(get_permitted_restricted_collections(collect_user_info(get_uid_from_email('hyde@cds.cern.ch'))), []) class WebSearchRestrictedPicturesTest(unittest.TestCase): """ Check whether restricted pictures on the demo site can be accessed well by people who have rights to access them. """ def test_restricted_pictures_guest(self): """websearch - restricted pictures not available to guest""" error_messages = test_web_page_content(CFG_SITE_URL + '/record/1/files/0106015_01.jpg', expected_text=['This file is restricted', 'You are not authorized']) if error_messages: self.fail(merge_error_messages(error_messages)) def test_restricted_pictures_romeo(self): """websearch - restricted pictures available to Romeo""" error_messages = test_web_page_content(CFG_SITE_URL + '/record/1/files/0106015_01.jpg', username='romeo', password='r123omeo', expected_text=[], unexpected_text=['This file is restricted', 'You are not authorized']) if error_messages: self.fail(merge_error_messages(error_messages)) def test_restricted_pictures_hyde(self): """websearch - restricted pictures not available to Mr. Hyde""" error_messages = test_web_page_content(CFG_SITE_URL + '/record/1/files/0106015_01.jpg', username='hyde', password='h123yde', expected_text=['This file is restricted', 'You are not authorized']) if error_messages: self.fail(merge_error_messages(error_messages)) class WebSearchRSSFeedServiceTest(unittest.TestCase): """Test of the RSS feed service.""" def test_rss_feed_service(self): """websearch - RSS feed service""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/rss', expected_text='<rss version="2.0"')) class WebSearchXSSVulnerabilityTest(unittest.TestCase): """Test possible XSS vulnerabilities of the search engine.""" def test_xss_in_collection_interface_page(self): """websearch - no XSS vulnerability in collection interface pages""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/?c=%3CSCRIPT%3Ealert%28%22XSS%22%29%3B%3C%2FSCRIPT%3E', expected_text='Collection &lt;SCRIPT&gt;alert("XSS");&lt;/SCRIPT&gt; Not Found')) def test_xss_in_collection_search_page(self): """websearch - no XSS vulnerability in collection search pages""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?c=%3CSCRIPT%3Ealert%28%22XSS%22%29%3B%3C%2FSCRIPT%3E', expected_text='Collection <SCRIPT>alert("XSS");</SCRIPT> Not Found')) def test_xss_in_simple_search(self): """websearch - no XSS vulnerability in simple search""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%3CSCRIPT%3Ealert%28%22XSS%22%29%3B%3C%2FSCRIPT%3E', expected_text='Search term <em><SCRIPT>alert("XSS");</SCRIPT></em> did not match any record.')) def test_xss_in_structured_search(self): """websearch - no XSS vulnerability in structured search""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%3CSCRIPT%3Ealert%28%22XSS%22%29%3B%3C%2FSCRIPT%3E&f=%3CSCRIPT%3Ealert%28%22XSS%22%29%3B%3C%2FSCRIPT%3E', expected_text='No word index is available for <em><SCRIPT>alert("XSS");</SCRIPT></em>.')) def test_xss_in_advanced_search(self): """websearch - no XSS vulnerability in advanced search""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?as=1&p1=ellis&f1=author&op1=a&p2=%3CSCRIPT%3Ealert%28%22XSS%22%29%3B%3C%2FSCRIPT%3E&f2=%3CSCRIPT%3Ealert%28%22XSS%22%29%3B%3C%2FSCRIPT%3E&m2=e', expected_text='Search term <em><SCRIPT>alert("XSS");</SCRIPT></em> inside index <em><SCRIPT>alert("XSS");</SCRIPT></em> did not match any record.')) def test_xss_in_browse(self): """websearch - no XSS vulnerability in browse""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%3CSCRIPT%3Ealert%28%22XSS%22%29%3B%3C%2FSCRIPT%3E&f=%3CSCRIPT%3Ealert%28%22XSS%22%29%3B%3C%2FSCRIPT%3E&action_browse=Browse', expected_text='<SCRIPT>alert("XSS");</SCRIPT>')) class WebSearchResultsOverview(unittest.TestCase): """Test of the search results page's Results overview box and links.""" def test_results_overview_split_off(self): """websearch - results overview box when split by collection is off""" browser = Browser() browser.open(CFG_SITE_URL + '/search?p=of&sc=0') body = browser.response().read() if body.find("Results overview") > -1: self.fail("Oops, when split by collection is off, " "results overview should not be present.") if body.find('<a name="1"></a>') == -1: self.fail("Oops, when split by collection is off, " "Atlantis collection should be found.") if body.find('<a name="15"></a>') > -1: self.fail("Oops, when split by collection is off, " "Multimedia & Arts should not be found.") try: browser.find_link(url='#15') self.fail("Oops, when split by collection is off, " "a link to Multimedia & Arts should not be found.") except LinkNotFoundError: pass def test_results_overview_split_on(self): """websearch - results overview box when split by collection is on""" browser = Browser() browser.open(CFG_SITE_URL + '/search?p=of&sc=1') body = browser.response().read() if body.find("Results overview") == -1: self.fail("Oops, when split by collection is on, " "results overview should be present.") if body.find('<a name="Atlantis%20Institute%20of%20Fictive%20Science"></a>') > -1: self.fail("Oops, when split by collection is on, " "Atlantis collection should not be found.") if body.find('<a name="15"></a>') == -1: self.fail("Oops, when split by collection is on, " "Multimedia & Arts should be found.") try: browser.find_link(url='#15') except LinkNotFoundError: self.fail("Oops, when split by collection is on, " "a link to Multimedia & Arts should be found.") class WebSearchSortResultsTest(unittest.TestCase): """Test of the search results page's sorting capability.""" def test_sort_results_default(self): """websearch - search results sorting, default method""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1', expected_text="[hep-th/9809057]")) def test_sort_results_ascending(self): """websearch - search results sorting, ascending field""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1&sf=reportnumber&so=a', expected_text="ISOLTRAP")) def test_sort_results_descending(self): """websearch - search results sorting, descending field""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1&sf=reportnumber&so=d', expected_text=" [SCAN-9605071]")) def test_sort_results_sort_pattern(self): """websearch - search results sorting, preferential sort pattern""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1&sf=reportnumber&so=d&sp=cern', expected_text="[CERN-TH-2002-069]")) class WebSearchSearchResultsXML(unittest.TestCase): """Test search results in various output""" def test_search_results_xm_output_split_on(self): """ websearch - check document element of search results in xm output (split by collection on)""" browser = Browser() browser.open(CFG_SITE_URL + '/search?sc=1&of=xm') body = browser.response().read() num_doc_element = body.count("<collection " "xmlns=\"http://www.loc.gov/MARC21/slim\">") if num_doc_element == 0: self.fail("Oops, no document element <collection " "xmlns=\"http://www.loc.gov/MARC21/slim\">" "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements <collection> " "found in search results.") num_doc_element = body.count("</collection>") if num_doc_element == 0: self.fail("Oops, no document element </collection> " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements </collection> " "found in search results.") def test_search_results_xm_output_split_off(self): """ websearch - check document element of search results in xm output (split by collection off)""" browser = Browser() browser.open(CFG_SITE_URL + '/search?sc=0&of=xm') body = browser.response().read() num_doc_element = body.count("<collection " "xmlns=\"http://www.loc.gov/MARC21/slim\">") if num_doc_element == 0: self.fail("Oops, no document element <collection " "xmlns=\"http://www.loc.gov/MARC21/slim\">" "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements <collection> " "found in search results.") num_doc_element = body.count("</collection>") if num_doc_element == 0: self.fail("Oops, no document element </collection> " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements </collection> " "found in search results.") def test_search_results_xd_output_split_on(self): """ websearch - check document element of search results in xd output (split by collection on)""" browser = Browser() browser.open(CFG_SITE_URL + '/search?sc=1&of=xd') body = browser.response().read() num_doc_element = body.count("<collection") if num_doc_element == 0: self.fail("Oops, no document element <collection " "xmlns=\"http://www.loc.gov/MARC21/slim\">" "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements <collection> " "found in search results.") num_doc_element = body.count("</collection>") if num_doc_element == 0: self.fail("Oops, no document element </collection> " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements </collection> " "found in search results.") def test_search_results_xd_output_split_off(self): """ websearch - check document element of search results in xd output (split by collection off)""" browser = Browser() browser.open(CFG_SITE_URL + '/search?sc=0&of=xd') body = browser.response().read() num_doc_element = body.count("<collection>") if num_doc_element == 0: self.fail("Oops, no document element <collection " "xmlns=\"http://www.loc.gov/MARC21/slim\">" "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements <collection> " "found in search results.") num_doc_element = body.count("</collection>") if num_doc_element == 0: self.fail("Oops, no document element </collection> " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements </collection> " "found in search results.") class WebSearchUnicodeQueryTest(unittest.TestCase): """Test of the search results for queries containing Unicode characters.""" def test_unicode_word_query(self): """websearch - Unicode word query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%CE%99%CE%B8%CE%AC%CE%BA%CE%B7', expected_text="[76]")) def test_unicode_word_query_not_found_term(self): """websearch - Unicode word query, not found term""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3A%CE%99%CE%B8', expected_text="ιθάκη")) def test_unicode_exact_phrase_query(self): """websearch - Unicode exact phrase query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%22%CE%99%CE%B8%CE%AC%CE%BA%CE%B7%22', expected_text="[76]")) def test_unicode_partial_phrase_query(self): """websearch - Unicode partial phrase query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%27%CE%B7%27', expected_text="[76]")) def test_unicode_regexp_query(self): """websearch - Unicode regexp query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%2F%CE%B7%2F', expected_text="[76]")) class WebSearchMARCQueryTest(unittest.TestCase): """Test of the search results for queries containing physical MARC tags.""" def test_single_marc_tag_exact_phrase_query(self): """websearch - single MARC tag, exact phrase query (100__a)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=100__a%3A%22Ellis%2C+J%22', expected_text="[9, 14, 18]")) def test_single_marc_tag_partial_phrase_query(self): """websearch - single MARC tag, partial phrase query (245__b)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245__b%3A%27and%27', expected_text="[28]")) def test_many_marc_tags_partial_phrase_query(self): """websearch - many MARC tags, partial phrase query (245)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245%3A%27and%27', expected_text="[1, 8, 9, 14, 15, 20, 22, 24, 28, 33, 47, 48, 49, 51, 53, 64, 69, 71, 79, 82, 83, 85, 91, 96]")) def test_single_marc_tag_regexp_query(self): """websearch - single MARC tag, regexp query""" # NOTE: regexp queries for physical MARC tags (e.g. 245:/and/) # are not treated by the search engine by purpose. But maybe # we should support them?! self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245%3A%2Fand%2F', expected_text="[]")) class WebSearchExtSysnoQueryTest(unittest.TestCase): """Test of queries using external system numbers.""" def test_existing_sysno_html_output(self): """websearch - external sysno query, existing sysno, HTML output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CER', expected_text="The wall of the cave")) def test_existing_sysno_id_output(self): """websearch - external sysno query, existing sysno, ID output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CER&of=id', expected_text="[95]")) def test_nonexisting_sysno_html_output(self): """websearch - external sysno query, non-existing sysno, HTML output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CERRRR', expected_text="Requested record does not seem to exist.")) def test_nonexisting_sysno_id_output(self): """websearch - external sysno query, non-existing sysno, ID output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CERRRR&of=id', expected_text="[]")) class WebSearchResultsRecordGroupingTest(unittest.TestCase): """Test search results page record grouping (rg).""" def test_search_results_rg_guest(self): """websearch - search results, records in groups of, guest""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?rg=17', expected_text="1 - 17")) def test_search_results_rg_nonguest(self): """websearch - search results, records in groups of, non-guest""" # This test used to fail due to saved user preference fetching # not overridden by URL rg argument. self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?rg=17', username='admin', expected_text="1 - 17")) class WebSearchSpecialTermsQueryTest(unittest.TestCase): """Test of the search results for queries containing special terms.""" def test_special_terms_u1(self): """websearch - query for special terms, U(1)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=U%281%29', expected_text="[57, 79, 80, 88]")) def test_special_terms_u1_and_sl(self): """websearch - query for special terms, U(1) SL(2,Z)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=U%281%29+SL%282%2CZ%29', expected_text="[88]")) def test_special_terms_u1_and_sl_or(self): """websearch - query for special terms, U(1) OR SL(2,Z)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=U%281%29+OR+SL%282%2CZ%29', expected_text="[57, 79, 80, 88]")) def test_special_terms_u1_and_sl_or_parens(self): """websearch - query for special terms, (U(1) OR SL(2,Z))""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=%28U%281%29+OR+SL%282%2CZ%29%29', expected_text="[57, 79, 80, 88]")) class WebSearchJournalQueryTest(unittest.TestCase): """Test of the search results for journal pubinfo queries.""" def test_query_journal_title_only(self): """websearch - journal publication info query, title only""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&f=journal&p=Phys.+Lett.+B', expected_text="[77, 78, 85, 87]")) def test_query_journal_full_pubinfo(self): """websearch - journal publication info query, full reference""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&f=journal&p=Phys.+Lett.+B+531+%282002%29+301', expected_text="[78]")) class WebSearchStemmedIndexQueryTest(unittest.TestCase): """Test of the search results for queries using stemmed indexes.""" def test_query_stemmed_lowercase(self): """websearch - stemmed index query, lowercase""" # note that dasse/Dasse is stemmed into dass/Dass, as expected self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=dasse', expected_text="[25, 26]")) def test_query_stemmed_uppercase(self): """websearch - stemmed index query, uppercase""" # ... but note also that DASSE is stemmed into DASSE(!); so # the test would fail if the search engine would not lower the # query term. (Something that is not necessary for # non-stemmed indexes.) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=DASSE', expected_text="[25, 26]")) class WebSearchSummarizerTest(unittest.TestCase): """Test of the search results summarizer functions.""" def test_most_popular_field_values_singletag(self): """websearch - most popular field values, simple tag""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('PREPRINT', 36), ('ARTICLE', 27), ('BOOK', 14), ('THESIS', 8), ('PICTURE', 7), ('POETRY', 2), ('REPORT', 2)), get_most_popular_field_values(range(0,100), '980__a')) def test_most_popular_field_values_singletag_multiexclusion(self): """websearch - most popular field values, simple tag, multiple exclusions""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('PREPRINT', 36), ('ARTICLE', 27), ('BOOK', 14), ('REPORT', 2)), get_most_popular_field_values(range(0,100), '980__a', ('THESIS', 'PICTURE', 'POETRY'))) def test_most_popular_field_values_multitag(self): """websearch - most popular field values, multiple tags""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('Ellis, J', 3), ('Enqvist, K', 1), ('Ibanez, L E', 1), ('Nanopoulos, D V', 1), ('Ross, G G', 1)), get_most_popular_field_values((9, 14, 18), ('100__a', '700__a'))) def test_most_popular_field_values_multitag_singleexclusion(self): """websearch - most popular field values, multiple tags, single exclusion""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('Enqvist, K', 1), ('Ibanez, L E', 1), ('Nanopoulos, D V', 1), ('Ross, G G', 1)), get_most_popular_field_values((9, 14, 18), ('100__a', '700__a'), ('Ellis, J'))) def test_most_popular_field_values_multitag_countrepetitive(self): """websearch - most popular field values, multiple tags, counting repetitive occurrences""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('THESIS', 2), ('REPORT', 1)), get_most_popular_field_values((41,), ('690C_a', '980__a'), count_repetitive_values=True)) self.assertEqual((('REPORT', 1), ('THESIS', 1)), get_most_popular_field_values((41,), ('690C_a', '980__a'), count_repetitive_values=False)) def test_ellis_citation_summary(self): """websearch - query ellis, citation summary output format""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis&of=hcs', expected_text="Less known papers (1-9)", expected_link_target=CFG_SITE_URL+"/search?p=ellis%20cited%3A1-%3E9&rm=citation", expected_link_label='1')) class WebSearchRecordCollectionGuessTest(unittest.TestCase): """Primary collection guessing tests.""" def test_guess_primary_collection_of_a_record(self): """websearch - guess_primary_collection_of_a_record""" self.assertEqual(guess_primary_collection_of_a_record(96), 'Articles') def test_guess_collection_of_a_record(self): """websearch - guess_collection_of_a_record""" self.assertEqual(guess_collection_of_a_record(96), 'Articles') self.assertEqual(guess_collection_of_a_record(96, '%s/collection/Theoretical Physics (TH)?ln=en' % CFG_SITE_URL), 'Articles') self.assertEqual(guess_collection_of_a_record(12, '%s/collection/Theoretical Physics (TH)?ln=en' % CFG_SITE_URL), 'Theoretical Physics (TH)') self.assertEqual(guess_collection_of_a_record(12, '%s/collection/Theoretical%%20Physics%%20%%28TH%%29?ln=en' % CFG_SITE_URL), 'Theoretical Physics (TH)') class WebSearchGetFieldValuesTest(unittest.TestCase): """Testing get_fieldvalues() function.""" def test_get_fieldvalues_001(self): """websearch - get_fieldvalues() for bibxxx-agnostic tags""" self.assertEqual(get_fieldvalues(10, '001___'), ['10']) def test_get_fieldvalues_980(self): """websearch - get_fieldvalues() for bibxxx-powered tags""" self.assertEqual(get_fieldvalues(18, '700__a'), ['Enqvist, K', 'Nanopoulos, D V']) self.assertEqual(get_fieldvalues(18, '909C1u'), ['CERN']) def test_get_fieldvalues_wildcard(self): """websearch - get_fieldvalues() for tag wildcards""" self.assertEqual(get_fieldvalues(18, '%'), []) self.assertEqual(get_fieldvalues(18, '7%'), []) self.assertEqual(get_fieldvalues(18, '700%'), ['Enqvist, K', 'Nanopoulos, D V']) self.assertEqual(get_fieldvalues(18, '909C0%'), ['1985', '13','TH']) def test_get_fieldvalues_recIDs(self): """websearch - get_fieldvalues() for list of recIDs""" self.assertEqual(get_fieldvalues([], '001___'), []) self.assertEqual(get_fieldvalues([], '700__a'), []) self.assertEqual(get_fieldvalues([10, 13], '001___'), ['10', '13']) self.assertEqual(get_fieldvalues([18, 13], '700__a'), ['Dawson, S', 'Ellis, R K', 'Enqvist, K', 'Nanopoulos, D V']) def test_get_fieldvalues_repetitive(self): """websearch - get_fieldvalues() for repetitive values""" self.assertEqual(get_fieldvalues([17, 18], '909C1u'), ['CERN', 'CERN']) self.assertEqual(get_fieldvalues([17, 18], '909C1u', repetitive_values=True), ['CERN', 'CERN']) self.assertEqual(get_fieldvalues([17, 18], '909C1u', repetitive_values=False), ['CERN']) TEST_SUITE = make_test_suite(WebSearchWebPagesAvailabilityTest, WebSearchTestSearch, WebSearchTestBrowse, WebSearchTestOpenURL, WebSearchTestCollections, WebSearchTestRecord, WebSearchTestLegacyURLs, WebSearchNearestTermsTest, WebSearchBooleanQueryTest, WebSearchAuthorQueryTest, WebSearchSearchEnginePythonAPITest, WebSearchSearchEngineWebAPITest, WebSearchRestrictedCollectionTest, WebSearchRestrictedPicturesTest, WebSearchRSSFeedServiceTest, WebSearchXSSVulnerabilityTest, WebSearchResultsOverview, WebSearchSortResultsTest, WebSearchSearchResultsXML, WebSearchUnicodeQueryTest, WebSearchMARCQueryTest, WebSearchExtSysnoQueryTest, WebSearchResultsRecordGroupingTest, WebSearchSpecialTermsQueryTest, WebSearchJournalQueryTest, WebSearchStemmedIndexQueryTest, WebSearchSummarizerTest, WebSearchRecordCollectionGuessTest, WebSearchGetFieldValuesTest) if __name__ == "__main__": run_test_suite(TEST_SUITE, warn_user=True) diff --git a/modules/websearch/lib/websearch_templates.py b/modules/websearch/lib/websearch_templates.py index d6044679b..dc7860da2 100644 --- a/modules/websearch/lib/websearch_templates.py +++ b/modules/websearch/lib/websearch_templates.py @@ -1,3623 +1,3642 @@ # -*- coding: utf-8 -*- ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. # pylint: disable-msg=C0301 __revision__ = "$Id$" import time import cgi import gettext import string import re import locale from urllib import quote, urlencode from xml.sax.saxutils import escape as xml_escape from invenio.config import \ CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH, \ CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH, \ CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH, \ CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD, \ CFG_WEBSEARCH_USE_ALEPH_SYSNOS, \ CFG_WEBSEARCH_SPLIT_BY_COLLECTION, \ CFG_BIBRANK_SHOW_READING_STATS, \ CFG_BIBRANK_SHOW_DOWNLOAD_STATS, \ CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS, \ CFG_BIBRANK_SHOW_CITATION_LINKS, \ CFG_BIBRANK_SHOW_CITATION_STATS, \ CFG_BIBRANK_SHOW_CITATION_GRAPHS, \ CFG_WEBSEARCH_INSTANT_BROWSE_RSS, \ CFG_WEBSEARCH_RSS_TTL, \ CFG_SITE_LANG, \ CFG_SITE_NAME, \ CFG_SITE_NAME_INTL, \ CFG_VERSION, \ CFG_SITE_URL, \ CFG_SITE_SUPPORT_EMAIL, \ CFG_INSPIRE_SITE, \ CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, \ CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES, \ CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS, \ CFG_BIBINDEX_CHARS_PUNCTUATION from invenio.dbquery import run_sql from invenio.messages import gettext_set_language #from invenio.search_engine_config import CFG_EXPERIMENTAL_FEATURES from invenio.urlutils import make_canonical_urlargd, drop_default_urlargd, create_html_link, create_url from invenio.htmlutils import nmtoken_from_string from invenio.webinterface_handler import wash_urlargd from invenio.bibrank_citation_searcher import get_cited_by_count from invenio.intbitset import intbitset from invenio.websearch_external_collections import external_collection_get_state _RE_PUNCTUATION = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION) _RE_SPACES = re.compile(r"\s+") def get_fieldvalues(recID, tag): """Return list of field values for field TAG inside record RECID. FIXME: should be imported commonly for search_engine too.""" out = [] if tag == "001___": # we have asked for recID that is not stored in bibXXx tables out.append(str(recID)) else: # we are going to look inside bibXXx tables digit = tag[0:2] bx = "bib%sx" % digit bibx = "bibrec_bib%sx" % digit query = "SELECT bx.value FROM %s AS bx, %s AS bibx WHERE bibx.id_bibrec='%s' AND bx.id=bibx.id_bibxxx AND bx.tag LIKE '%s'" \ "ORDER BY bibx.field_number, bx.tag ASC" % (bx, bibx, recID, tag) res = run_sql(query) for row in res: out.append(row[0]) return out class Template: # This dictionary maps CDS Invenio language code to locale codes (ISO 639) tmpl_localemap = { 'bg': 'bg_BG', 'ca': 'ca_ES', 'de': 'de_DE', 'el': 'el_GR', 'en': 'en_US', 'es': 'es_ES', 'pt': 'pt_BR', 'fr': 'fr_FR', 'it': 'it_IT', 'ru': 'ru_RU', 'sk': 'sk_SK', 'cs': 'cs_CZ', 'no': 'no_NO', 'sv': 'sv_SE', 'uk': 'uk_UA', 'ja': 'ja_JA', 'pl': 'pl_PL', 'hr': 'hr_HR', 'zh_CN': 'zh_CN', 'zh_TW': 'zh_TW', 'hu': 'hu_HU', 'af': 'af_ZA', 'gl': 'gl_ES' } tmpl_default_locale = "en_US" # which locale to use by default, useful in case of failure # Type of the allowed parameters for the web interface for search results search_results_default_urlargd = { 'cc': (str, CFG_SITE_NAME), 'c': (list, []), 'p': (str, ""), 'f': (str, ""), 'rg': (int, 10), 'sf': (str, ""), 'so': (str, "d"), 'sp': (str, ""), 'rm': (str, ""), 'of': (str, "hb"), 'ot': (list, []), + 'aas': (int, CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE), 'as': (int, CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE), 'p1': (str, ""), 'f1': (str, ""), 'm1': (str, ""), 'op1':(str, ""), 'p2': (str, ""), 'f2': (str, ""), 'm2': (str, ""), 'op2':(str, ""), 'p3': (str, ""), 'f3': (str, ""), 'm3': (str, ""), 'sc': (int, 0), 'jrec': (int, 0), 'recid': (int, -1), 'recidb': (int, -1), 'sysno': (str, ""), 'id': (int, -1), 'idb': (int, -1), 'sysnb': (str, ""), 'action': (str, "search"), 'action_search': (str, ""), 'action_browse': (str, ""), 'd1': (str, ""), 'd1y': (int, 0), 'd1m': (int, 0), 'd1d': (int, 0), 'd2': (str, ""), 'd2y': (int, 0), 'd2m': (int, 0), 'd2d': (int, 0), 'dt': (str, ""), 'ap': (int, 1), 'verbose': (int, 0), 'ec': (list, []), } # ...and for search interfaces search_interface_default_urlargd = { + 'aas': (int, CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE), 'as': (int, CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE), 'verbose': (int, 0)} # ...and for RSS feeds rss_default_urlargd = {'c' : (list, []), 'cc' : (str, ""), 'p' : (str, ""), 'f' : (str, ""), 'p1' : (str, ""), 'f1' : (str, ""), 'm1' : (str, ""), 'op1': (str, ""), 'p2' : (str, ""), 'f2' : (str, ""), 'm2' : (str, ""), 'op2': (str, ""), 'p3' : (str, ""), 'f3' : (str, ""), 'm3' : (str, "")} tmpl_openurl_accepted_args = { 'id' : (list, []), 'genre' : (str, ''), 'aulast' : (str, ''), 'aufirst' : (str, ''), 'auinit' : (str, ''), 'auinit1' : (str, ''), 'auinitm' : (str, ''), 'issn' : (str, ''), 'eissn' : (str, ''), 'coden' : (str, ''), 'isbn' : (str, ''), 'sici' : (str, ''), 'bici' : (str, ''), 'title' : (str, ''), 'stitle' : (str, ''), 'atitle' : (str, ''), 'volume' : (str, ''), 'part' : (str, ''), 'issue' : (str, ''), 'spage' : (str, ''), 'epage' : (str, ''), 'pages' : (str, ''), 'artnum' : (str, ''), 'date' : (str, ''), 'ssn' : (str, ''), 'quarter' : (str, ''), 'url_ver' : (str, ''), 'ctx_ver' : (str, ''), 'rft_val_fmt' : (str, ''), 'rft_id' : (list, []), 'rft.atitle' : (str, ''), 'rft.title' : (str, ''), 'rft.jtitle' : (str, ''), 'rft.stitle' : (str, ''), 'rft.date' : (str, ''), 'rft.volume' : (str, ''), 'rft.issue' : (str, ''), 'rft.spage' : (str, ''), 'rft.epage' : (str, ''), 'rft.pages' : (str, ''), 'rft.artnumber' : (str, ''), 'rft.issn' : (str, ''), 'rft.eissn' : (str, ''), 'rft.aulast' : (str, ''), 'rft.aufirst' : (str, ''), 'rft.auinit' : (str, ''), 'rft.auinit1' : (str, ''), 'rft.auinitm' : (str, ''), 'rft.ausuffix' : (str, ''), 'rft.au' : (list, []), 'rft.aucorp' : (str, ''), 'rft.isbn' : (str, ''), 'rft.coden' : (str, ''), 'rft.sici' : (str, ''), 'rft.genre' : (str, 'unknown'), 'rft.chron' : (str, ''), 'rft.ssn' : (str, ''), 'rft.quarter' : (int, ''), 'rft.part' : (str, ''), 'rft.btitle' : (str, ''), 'rft.isbn' : (str, ''), 'rft.atitle' : (str, ''), 'rft.place' : (str, ''), 'rft.pub' : (str, ''), 'rft.edition' : (str, ''), 'rft.tpages' : (str, ''), 'rft.series' : (str, ''), } def tmpl_openurl2invenio(self, openurl_data): """ Return an Invenio url corresponding to a search with the data included in the openurl form map. """ def isbn_to_isbn13_isbn10(isbn): isbn = isbn.replace(' ', '').replace('-', '') if len(isbn) == 10 and isbn.isdigit(): ## We already have isbn10 return ('', isbn) if len(isbn) != 13 and isbn.isdigit(): return ('', '') isbn13, isbn10 = isbn, isbn[3:-1] checksum = 0 weight = 10 for char in isbn10: checksum += int(char) * weight weight -= 1 checksum = 11 - (checksum % 11) if checksum == 10: isbn10 += 'X' if checksum == 11: isbn10 += '0' else: isbn10 += str(checksum) return (isbn13, isbn10) from invenio.search_engine import perform_request_search doi = '' pmid = '' bibcode = '' oai = '' issn = '' isbn = '' for elem in openurl_data['id']: if elem.startswith('doi:'): doi = elem[len('doi:'):] elif elem.startswith('pmid:'): pmid = elem[len('pmid:'):] elif elem.startswith('bibcode:'): bibcode = elem[len('bibcode:'):] elif elem.startswith('oai:'): oai = elem[len('oai:'):] for elem in openurl_data['rft_id']: if elem.startswith('info:doi/'): doi = elem[len('info:doi/'):] elif elem.startswith('info:pmid/'): pmid = elem[len('info:pmid/'):] elif elem.startswith('info:bibcode/'): bibcode = elem[len('info:bibcode/'):] elif elem.startswith('info:oai/'): oai = elem[len('info:oai/')] elif elem.startswith('urn:ISBN:'): isbn = elem[len('urn:ISBN:'):] elif elem.startswith('urn:ISSN:'): issn = elem[len('urn:ISSN:'):] ## Building author query aulast = openurl_data['rft.aulast'] or openurl_data['aulast'] aufirst = openurl_data['rft.aufirst'] or openurl_data['aufirst'] auinit = openurl_data['rft.auinit'] or \ openurl_data['auinit'] or \ openurl_data['rft.auinit1'] + ' ' + openurl_data['rft.auinitm'] or \ openurl_data['auinit1'] + ' ' + openurl_data['auinitm'] or aufirst[:1] auinit = auinit.upper() if aulast and aufirst: author_query = 'author:"%s, %s" or author:"%s, %s"' % (aulast, aufirst, aulast, auinit) elif aulast and auinit: author_query = 'author:"%s, %s"' % (aulast, auinit) else: author_query = '' ## Building title query title = openurl_data['rft.atitle'] or \ openurl_data['atitle'] or \ openurl_data['rft.btitle'] or \ openurl_data['rft.title'] or \ openurl_data['title'] if title: title_query = 'title:"%s"' % title title_query_cleaned = 'title:"%s"' % _RE_SPACES.sub(' ', _RE_PUNCTUATION.sub(' ', title)) else: title_query = '' ## Building journal query jtitle = openurl_data['rft.stitle'] or \ openurl_data['stitle'] or \ openurl_data['rft.jtitle'] or \ openurl_data['title'] if jtitle: journal_query = 'journal:"%s"' % jtitle else: journal_query = '' ## Building isbn query isbn = isbn or openurl_data['rft.isbn'] or \ openurl_data['isbn'] isbn13, isbn10 = isbn_to_isbn13_isbn10(isbn) if isbn13: isbn_query = 'isbn:"%s" or isbn:"%s"' % (isbn13, isbn10) elif isbn10: isbn_query = 'isbn:"%s"' % isbn10 else: isbn_query = '' ## Building issn query issn = issn or openurl_data['rft.eissn'] or \ openurl_data['eissn'] or \ openurl_data['rft.issn'] or \ openurl_data['issn'] if issn: issn_query = 'issn:"%s"' % issn else: issn_query = '' ## Building coden query coden = openurl_data['rft.coden'] or openurl_data['coden'] if coden: coden_query = 'coden:"%s"' % coden else: coden_query = '' ## Building doi query if False: #doi: #FIXME Temporaly disabled until doi field is properly setup doi_query = 'doi:"%s"' % doi else: doi_query = '' ## Trying possible searches if doi_query: if perform_request_search(p=doi_query): return '%s/search?%s' % (CFG_SITE_URL, urlencode({ 'p' : doi_query, 'sc' : CFG_WEBSEARCH_SPLIT_BY_COLLECTION, 'of' : 'hd'})) if isbn_query: if perform_request_search(p=isbn_query): return '%s/search?%s' % (CFG_SITE_URL, urlencode({ 'p' : isbn_query, 'sc' : CFG_WEBSEARCH_SPLIT_BY_COLLECTION, 'of' : 'hd'})) if coden_query: if perform_request_search(p=coden_query): return '%s/search?%s' % (CFG_SITE_URL, urlencode({ 'p' : coden_query, 'sc' : CFG_WEBSEARCH_SPLIT_BY_COLLECTION, 'of' : 'hd'})) if author_query and title_query: if perform_request_search(p='%s and %s' % (title_query, author_query)): return '%s/search?%s' % (CFG_SITE_URL, urlencode({ 'p' : '%s and %s' % (title_query, author_query), 'sc' : CFG_WEBSEARCH_SPLIT_BY_COLLECTION, 'of' : 'hd'})) if title_query: result = len(perform_request_search(p=title_query)) if result == 1: return '%s/search?%s' % (CFG_SITE_URL, urlencode({ 'p' : title_query, 'sc' : CFG_WEBSEARCH_SPLIT_BY_COLLECTION, 'of' : 'hd'})) elif result > 1: return '%s/search?%s' % (CFG_SITE_URL, urlencode({ 'p' : title_query, 'sc' : CFG_WEBSEARCH_SPLIT_BY_COLLECTION, 'of' : 'hb'})) ## Nothing worked, let's return a search that the user can improve if author_query and title_query: return '%s/search%s' % (CFG_SITE_URL, make_canonical_urlargd({ 'p' : '%s and %s' % (title_query_cleaned, author_query), 'sc' : CFG_WEBSEARCH_SPLIT_BY_COLLECTION, 'of' : 'hb'}, {})) elif title_query: return '%s/search%s' % (CFG_SITE_URL, make_canonical_urlargd({ 'p' : title_query_cleaned, 'sc' : CFG_WEBSEARCH_SPLIT_BY_COLLECTION, 'of' : 'hb'}, {})) else: ## Mmh. Too few information provided. return '%s/search%s' % (CFG_SITE_URL, make_canonical_urlargd({ 'p' : 'recid:-1', 'sc' : CFG_WEBSEARCH_SPLIT_BY_COLLECTION, 'of' : 'hb'}, {})) def build_search_url(self, known_parameters={}, **kargs): """ Helper for generating a canonical search url. 'known_parameters' is the list of query parameters you inherit from your current query. You can then pass keyword arguments to modify this query. build_search_url(known_parameters, of="xm") The generated URL is absolute. """ parameters = {} parameters.update(known_parameters) parameters.update(kargs) # Now, we only have the arguments which have _not_ their default value parameters = drop_default_urlargd(parameters, self.search_results_default_urlargd) + # Treat `as' argument specially: + if parameters.has_key('aas'): + parameters['as'] = parameters['aas'] + del parameters['aas'] + # Asking for a recid? Return a /record/<recid> URL if 'recid' in parameters: target = "%s/record/%s" % (CFG_SITE_URL, parameters['recid']) del parameters['recid'] target += make_canonical_urlargd(parameters, self.search_results_default_urlargd) return target return "%s/search%s" % (CFG_SITE_URL, make_canonical_urlargd(parameters, self.search_results_default_urlargd)) def build_search_interface_url(self, known_parameters={}, **kargs): """ Helper for generating a canonical search interface URL.""" parameters = {} parameters.update(known_parameters) parameters.update(kargs) c = parameters['c'] del parameters['c'] # Now, we only have the arguments which have _not_ their default value + parameters = drop_default_urlargd(parameters, self.search_results_default_urlargd) + + # Treat `as' argument specially: + if parameters.has_key('aas'): + parameters['as'] = parameters['aas'] + del parameters['aas'] + if c and c != CFG_SITE_NAME: base = CFG_SITE_URL + '/collection/' + quote(c) else: base = CFG_SITE_URL - return create_url(base, drop_default_urlargd(parameters, self.search_results_default_urlargd)) + return create_url(base, parameters) def build_rss_url(self, known_parameters, **kargs): """Helper for generating a canonical RSS URL""" parameters = {} parameters.update(known_parameters) parameters.update(kargs) # Keep only interesting parameters argd = wash_urlargd(parameters, self.rss_default_urlargd) if argd: # Handle 'c' differently since it is a list c = argd.get('c', []) del argd['c'] # Create query, and drop empty params args = make_canonical_urlargd(argd, self.rss_default_urlargd) if c != []: # Add collections c = [quote(coll) for coll in c] if args == '': args += '?' else: args += '&' args += 'c=' + '&c='.join(c) return CFG_SITE_URL + '/rss' + args def tmpl_record_page_header_content(self, req, recid, ln): """ Provide extra information in the header of /record pages """ _ = gettext_set_language(ln) title = get_fieldvalues(recid, "245__a") if title: title = cgi.escape(title[0]) else: title = _("Record") + ' #%d' % recid keywords = ', '.join(get_fieldvalues(recid, "6531_a")) description = ' '.join(get_fieldvalues(recid, "520__a")) description += "\n" description += '; '.join(get_fieldvalues(recid, "100__a") + get_fieldvalues(recid, "700__a")) return [cgi.escape(x, True) for x in (title, description, keywords)] - def tmpl_navtrail_links(self, as, ln, dads): + def tmpl_navtrail_links(self, aas, ln, dads): """ Creates the navigation bar at top of each search page (*Home > Root collection > subcollection > ...*) Parameters: - - 'as' *int* - Should we display an advanced search box? + - 'aas' *int* - Should we display an advanced search box? - 'ln' *string* - The language to display - 'separator' *string* - The separator between two consecutive collections - 'dads' *list* - A list of parent links, eachone being a dictionary of ('name', 'longname') """ out = [] for url, name in dads: - out.append(create_html_link(self.build_search_interface_url(c=url, as=as, ln=ln), {}, cgi.escape(name), {'class': 'navtrail'})) + args = {'c': url, 'as': aas, 'ln': ln} + out.append(create_html_link(self.build_search_interface_url(**args), {}, cgi.escape(name), {'class': 'navtrail'})) return ' > '.join(out) def tmpl_webcoll_body(self, ln, collection, te_portalbox, searchfor, np_portalbox, narrowsearch, focuson, instantbrowse, ne_portalbox): """ Creates the body of the main search page. Parameters: - 'ln' *string* - language of the page being generated - 'collection' - collection id of the page being generated - 'te_portalbox' *string* - The HTML code for the portalbox on top of search - 'searchfor' *string* - The HTML code for the search for box - 'np_portalbox' *string* - The HTML code for the portalbox on bottom of search - 'narrowsearch' *string* - The HTML code for the search categories (left bottom of page) - 'focuson' *string* - The HTML code for the "focuson" categories (right bottom of page) - 'ne_portalbox' *string* - The HTML code for the bottom of the page """ if not narrowsearch: narrowsearch = instantbrowse body = ''' <form name="search" action="%(siteurl)s/search" method="get"> %(searchfor)s %(np_portalbox)s <table cellspacing="0" cellpadding="0" border="0"> <tr> <td valign="top">%(narrowsearch)s</td> ''' % { 'siteurl' : CFG_SITE_URL, 'searchfor' : searchfor, 'np_portalbox' : np_portalbox, 'narrowsearch' : narrowsearch } if focuson: body += """<td valign="top">""" + focuson + """</td>""" body += """</tr></table> %(ne_portalbox)s </form>""" % {'ne_portalbox' : ne_portalbox} return body def tmpl_portalbox(self, title, body): """Creates portalboxes based on the parameters Parameters: - 'title' *string* - The title of the box - 'body' *string* - The HTML code for the body of the box """ out = """<div class="portalbox"> <div class="portalboxheader">%(title)s</div> <div class="portalboxbody">%(body)s</div> </div>""" % {'title' : cgi.escape(title), 'body' : body} return out def tmpl_searchfor_light(self, ln, collection_id, collection_name, record_count, example_search_queries): # EXPERIMENTAL """Produces light *Search for* box for the current collection. Parameters: - 'ln' *string* - *str* The language to display - 'collection_id' - *str* The collection id - 'collection_name' - *str* The collection name in current language - 'example_search_queries' - *list* List of search queries given as example for this collection """ # load the right message language _ = gettext_set_language(ln) out = ''' <!--create_searchfor_light()--> ''' argd = drop_default_urlargd({'ln': ln, 'sc': CFG_WEBSEARCH_SPLIT_BY_COLLECTION}, self.search_results_default_urlargd) # Only add non-default hidden values for field, value in argd.items(): out += self.tmpl_input_hidden(field, value) header = _("Search %s records for:") % \ self.tmpl_nbrecs_info(record_count, "", "") asearchurl = self.build_search_interface_url(c=collection_id, - as=max(CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES), + aas=max(CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES), ln=ln) # Build example of queries for this collection example_search_queries_links = [create_html_link(self.build_search_url(p=example_query, ln=ln, - as=-1, + aas=-1, c=collection_id), {}, cgi.escape(example_query), {'class': 'examplequery'}) \ for example_query in example_search_queries] example_query_html = '' if len(example_search_queries) > 0: example_query_link = example_search_queries_links[0] # offers more examples if possible more = '' if len(example_search_queries_links) > 1: more = ''' <script type="text/javascript"> function toggle_more_example_queries_visibility(){ var more = document.getElementById('more_example_queries'); var link = document.getElementById('link_example_queries'); var sep = document.getElementById('more_example_sep'); if (more.style.display=='none'){ more.style.display = ''; link.innerHTML = "%(show_less)s" link.style.color = "rgb(204,0,0)"; sep.style.display = 'none'; } else { more.style.display = 'none'; link.innerHTML = "%(show_more)s" link.style.color = "rgb(0,0,204)"; sep.style.display = ''; } return false; } </script> <span id="more_example_queries" style="display:none;text-align:right"><br/>%(more_example_queries)s<br/></span> <a id="link_example_queries" href="#" onclick="toggle_more_example_queries_visibility()" style="display:none"></a> <script type="text/javascript"> var link = document.getElementById('link_example_queries'); var sep = document.getElementById('more_example_sep'); link.style.display = ''; link.innerHTML = "%(show_more)s"; sep.style.display = ''; </script> ''' % {'more_example_queries': '<br/>'.join(example_search_queries_links[1:]), 'show_less':_("less"), 'show_more':_("more")} example_query_html += '''<p style="text-align:right;margin:0px;"> %(example)s<span id="more_example_sep" style="display:none;"> :: </span>%(more)s </p> ''' % {'example': _("Example: %(x_sample_search_query)s") % \ {'x_sample_search_query': example_query_link}, 'more': more} # display options to search in current collection or everywhere search_in = '' if collection_name != CFG_SITE_NAME_INTL.get(ln, CFG_SITE_NAME): search_in += ''' <input type="radio" name="cc" value="%(collection_id)s" id="searchCollection" checked="checked"/> <label for="searchCollection">%(search_in_collection_name)s</label> <input type="radio" name="cc" value="%(root_collection_name)s" id="searchEverywhere" /> <label for="searchEverywhere">%(search_everywhere)s</label> ''' % {'search_in_collection_name': _("Search in %(x_collection_name)s") % \ {'x_collection_name': collection_name}, 'collection_id': collection_id, 'root_collection_name': CFG_SITE_NAME, 'search_everywhere': _("Search everywhere")} # print commentary start: out += ''' <table> <tbody> <tr valign="baseline"> <td class="searchboxbody" align="right"><input type="text" name="p" size="%(sizepattern)d" value="" /><br/> <small><small>%(example_query_html)s</small></small> </td> <td class="searchboxbody" align="left"> <input class="formbutton" type="submit" name="action_search" value="%(msg_search)s" /> </td> <td class="searchboxbody" align="left" rowspan="2" valign="top"> <small><small> <a href="%(siteurl)s/help/search-tips%(langlink)s">%(msg_search_tips)s</a><br/> %(asearch)s </small></small> </td> </tr></table> <!--<tr valign="baseline"> <td class="searchboxbody" colspan="2" align="left"> <small> --><small>%(search_in)s</small><!-- </small> </td> </tr> </tbody> </table>--> <!--/create_searchfor_light()--> ''' % {'ln' : ln, 'sizepattern' : CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH, 'langlink': ln != CFG_SITE_LANG and '?ln=' + ln or '', 'siteurl' : CFG_SITE_URL, 'asearch' : create_html_link(asearchurl, {}, _('Advanced Search')), 'header' : header, 'msg_search' : _('Search'), 'msg_browse' : _('Browse'), 'msg_search_tips' : _('Search Tips'), 'search_in': search_in, 'example_query_html': example_query_html} return out def tmpl_searchfor_simple(self, ln, collection_id, collection_name, record_count, middle_option): """Produces simple *Search for* box for the current collection. Parameters: - 'ln' *string* - *str* The language to display - 'collection_id' - *str* The collection id - 'collection_name' - *str* The collection name in current language - 'record_count' - *str* Number of records in this collection - 'middle_option' *string* - HTML code for the options (any field, specific fields ...) """ # load the right message language _ = gettext_set_language(ln) out = ''' <!--create_searchfor_simple()--> ''' argd = drop_default_urlargd({'ln': ln, 'cc': collection_id, 'sc': CFG_WEBSEARCH_SPLIT_BY_COLLECTION}, self.search_results_default_urlargd) # Only add non-default hidden values for field, value in argd.items(): out += self.tmpl_input_hidden(field, value) header = _("Search %s records for:") % \ self.tmpl_nbrecs_info(record_count, "", "") asearchurl = self.build_search_interface_url(c=collection_id, - as=max(CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES), + aas=max(CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES), ln=ln) # print commentary start: out += ''' <table class="searchbox"> <thead> <tr align="left"> <th colspan="3" class="searchboxheader">%(header)s</th> </tr> </thead> <tbody> <tr valign="baseline"> <td class="searchboxbody" align="left"><input type="text" name="p" size="%(sizepattern)d" value="" /></td> <td class="searchboxbody" align="left">%(middle_option)s</td> <td class="searchboxbody" align="left"> <input class="formbutton" type="submit" name="action_search" value="%(msg_search)s" /> <input class="formbutton" type="submit" name="action_browse" value="%(msg_browse)s" /></td> </tr> <tr valign="baseline"> <td class="searchboxbody" colspan="3" align="right"> <small> <a href="%(siteurl)s/help/search-tips%(langlink)s">%(msg_search_tips)s</a> :: %(asearch)s </small> </td> </tr> </tbody> </table> <!--/create_searchfor_simple()--> ''' % {'ln' : ln, 'sizepattern' : CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH, 'langlink': ln != CFG_SITE_LANG and '?ln=' + ln or '', 'siteurl' : CFG_SITE_URL, 'asearch' : create_html_link(asearchurl, {}, _('Advanced Search')), 'header' : header, 'middle_option' : middle_option, 'msg_search' : _('Search'), 'msg_browse' : _('Browse'), 'msg_search_tips' : _('Search Tips')} return out def tmpl_searchfor_advanced(self, ln, # current language collection_id, collection_name, record_count, middle_option_1, middle_option_2, middle_option_3, searchoptions, sortoptions, rankoptions, displayoptions, formatoptions ): """ Produces advanced *Search for* box for the current collection. Parameters: - 'ln' *string* - The language to display - 'middle_option_1' *string* - HTML code for the first row of options (any field, specific fields ...) - 'middle_option_2' *string* - HTML code for the second row of options (any field, specific fields ...) - 'middle_option_3' *string* - HTML code for the third row of options (any field, specific fields ...) - 'searchoptions' *string* - HTML code for the search options - 'sortoptions' *string* - HTML code for the sort options - 'rankoptions' *string* - HTML code for the rank options - 'displayoptions' *string* - HTML code for the display options - 'formatoptions' *string* - HTML code for the format options """ # load the right message language _ = gettext_set_language(ln) out = ''' <!--create_searchfor_advanced()--> ''' - argd = drop_default_urlargd({'ln': ln, 'as': 1, 'cc': collection_id, 'sc': CFG_WEBSEARCH_SPLIT_BY_COLLECTION}, + argd = drop_default_urlargd({'ln': ln, 'aas': 1, 'cc': collection_id, 'sc': CFG_WEBSEARCH_SPLIT_BY_COLLECTION}, self.search_results_default_urlargd) # Only add non-default hidden values for field, value in argd.items(): out += self.tmpl_input_hidden(field, value) header = _("Search %s records for") % \ self.tmpl_nbrecs_info(record_count, "", "") header += ':' - ssearchurl = self.build_search_interface_url(c=collection_id, as=min(CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES), ln=ln) + ssearchurl = self.build_search_interface_url(c=collection_id, aas=min(CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES), ln=ln) out += ''' <table class="searchbox"> <thead> <tr> <th class="searchboxheader" colspan="3">%(header)s</th> </tr> </thead> <tbody> <tr valign="bottom"> <td class="searchboxbody" style="white-space: nowrap;"> %(matchbox_m1)s<input type="text" name="p1" size="%(sizepattern)d" value="" /> </td> <td class="searchboxbody" style="white-space: nowrap;">%(middle_option_1)s</td> <td class="searchboxbody">%(andornot_op1)s</td> </tr> <tr valign="bottom"> <td class="searchboxbody" style="white-space: nowrap;"> %(matchbox_m2)s<input type="text" name="p2" size="%(sizepattern)d" value="" /> </td> <td class="searchboxbody">%(middle_option_2)s</td> <td class="searchboxbody">%(andornot_op2)s</td> </tr> <tr valign="bottom"> <td class="searchboxbody" style="white-space: nowrap;"> %(matchbox_m3)s<input type="text" name="p3" size="%(sizepattern)d" value="" /> </td> <td class="searchboxbody">%(middle_option_3)s</td> <td class="searchboxbody" style="white-space: nowrap;"> <input class="formbutton" type="submit" name="action_search" value="%(msg_search)s" /> <input class="formbutton" type="submit" name="action_browse" value="%(msg_browse)s" /></td> </tr> <tr valign="bottom"> <td colspan="3" class="searchboxbody" align="right"> <small> <a href="%(siteurl)s/help/search-tips%(langlink)s">%(msg_search_tips)s</a> :: %(ssearch)s </small> </td> </tr> </tbody> </table> <!-- @todo - more imports --> ''' % {'ln' : ln, 'langlink': ln != CFG_SITE_LANG and '?ln=' + ln or '', 'siteurl' : CFG_SITE_URL, 'ssearch' : create_html_link(ssearchurl, {}, _("Simple Search")), 'header' : header, 'sizepattern' : CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH, 'matchbox_m1' : self.tmpl_matchtype_box('m1', ln=ln), 'middle_option_1' : middle_option_1, 'andornot_op1' : self.tmpl_andornot_box('op1', ln=ln), 'matchbox_m2' : self.tmpl_matchtype_box('m2', ln=ln), 'middle_option_2' : middle_option_2, 'andornot_op2' : self.tmpl_andornot_box('op2', ln=ln), 'matchbox_m3' : self.tmpl_matchtype_box('m3', ln=ln), 'middle_option_3' : middle_option_3, 'msg_search' : _("Search"), 'msg_browse' : _("Browse"), 'msg_search_tips' : _("Search Tips")} if (searchoptions): out += """<table class="searchbox"> <thead> <tr> <th class="searchboxheader"> %(searchheader)s </th> </tr> </thead> <tbody> <tr valign="bottom"> <td class="searchboxbody">%(searchoptions)s</td> </tr> </tbody> </table>""" % { 'searchheader' : _("Search options:"), 'searchoptions' : searchoptions } out += """<table class="searchbox"> <thead> <tr> <th class="searchboxheader"> %(added)s </th> <th class="searchboxheader"> %(until)s </th> </tr> </thead> <tbody> <tr valign="bottom"> <td class="searchboxbody">%(added_or_modified)s %(date_added)s</td> <td class="searchboxbody">%(date_until)s</td> </tr> </tbody> </table> <table class="searchbox"> <thead> <tr> <th class="searchboxheader"> %(msg_sort)s </th> <th class="searchboxheader"> %(msg_display)s </th> <th class="searchboxheader"> %(msg_format)s </th> </tr> </thead> <tbody> <tr valign="bottom"> <td class="searchboxbody">%(sortoptions)s %(rankoptions)s</td> <td class="searchboxbody">%(displayoptions)s</td> <td class="searchboxbody">%(formatoptions)s</td> </tr> </tbody> </table> <!--/create_searchfor_advanced()--> """ % { 'added' : _("Added/modified since:"), 'until' : _("until:"), 'added_or_modified': self.tmpl_inputdatetype(ln=ln), 'date_added' : self.tmpl_inputdate("d1", ln=ln), 'date_until' : self.tmpl_inputdate("d2", ln=ln), 'msg_sort' : _("Sort by:"), 'msg_display' : _("Display results:"), 'msg_format' : _("Output format:"), 'sortoptions' : sortoptions, 'rankoptions' : rankoptions, 'displayoptions' : displayoptions, 'formatoptions' : formatoptions } return out def tmpl_matchtype_box(self, name='m', value='', ln='en'): """Returns HTML code for the 'match type' selection box. Parameters: - 'name' *string* - The name of the produced select - 'value' *string* - The selected value (if any value is already selected) - 'ln' *string* - the language to display """ # load the right message language _ = gettext_set_language(ln) out = """ <select name="%(name)s"> <option value="a"%(sela)s>%(opta)s</option> <option value="o"%(selo)s>%(opto)s</option> <option value="e"%(sele)s>%(opte)s</option> <option value="p"%(selp)s>%(optp)s</option> <option value="r"%(selr)s>%(optr)s</option> </select> """ % {'name' : name, 'sela' : self.tmpl_is_selected('a', value), 'opta' : _("All of the words:"), 'selo' : self.tmpl_is_selected('o', value), 'opto' : _("Any of the words:"), 'sele' : self.tmpl_is_selected('e', value), 'opte' : _("Exact phrase:"), 'selp' : self.tmpl_is_selected('p', value), 'optp' : _("Partial phrase:"), 'selr' : self.tmpl_is_selected('r', value), 'optr' : _("Regular expression:") } return out def tmpl_is_selected(self, var, fld): """ Checks if *var* and *fld* are equal, and if yes, returns ' selected="selected"'. Useful for select boxes. Parameters: - 'var' *string* - First value to compare - 'fld' *string* - Second value to compare """ if var == fld: return ' selected="selected"' else: return "" def tmpl_andornot_box(self, name='op', value='', ln='en'): """ Returns HTML code for the AND/OR/NOT selection box. Parameters: - 'name' *string* - The name of the produced select - 'value' *string* - The selected value (if any value is already selected) - 'ln' *string* - the language to display """ # load the right message language _ = gettext_set_language(ln) out = """ <select name="%(name)s"> <option value="a"%(sela)s>%(opta)s</option> <option value="o"%(selo)s>%(opto)s</option> <option value="n"%(seln)s>%(optn)s</option> </select> """ % {'name' : name, 'sela' : self.tmpl_is_selected('a', value), 'opta' : _("AND"), 'selo' : self.tmpl_is_selected('o', value), 'opto' : _("OR"), 'seln' : self.tmpl_is_selected('n', value), 'optn' : _("AND NOT") } return out def tmpl_inputdate(self, name, ln, sy = 0, sm = 0, sd = 0): """ Produces *From Date*, *Until Date* kind of selection box. Suitable for search options. Parameters: - 'name' *string* - The base name of the produced selects - 'ln' *string* - the language to display """ # load the right message language _ = gettext_set_language(ln) box = """ <select name="%(name)sd"> <option value=""%(sel)s>%(any)s</option> """ % { 'name' : name, 'any' : _("any day"), 'sel' : self.tmpl_is_selected(sd, 0) } for day in range(1, 32): box += """<option value="%02d"%s>%02d</option>""" % (day, self.tmpl_is_selected(sd, day), day) box += """</select>""" # month box += """ <select name="%(name)sm"> <option value=""%(sel)s>%(any)s</option> """ % { 'name' : name, 'any' : _("any month"), 'sel' : self.tmpl_is_selected(sm, 0) } for mm, month in [(1, _("January")), (2, _("February")), (3, _("March")), (4, _("April")), \ (5, _("May")), (6, _("June")), (7, _("July")), (8, _("August")), \ (9, _("September")), (10, _("October")), (11, _("November")), (12, _("December"))]: box += """<option value="%02d"%s>%s</option>""" % (mm, self.tmpl_is_selected(sm, mm), month) box += """</select>""" # year box += """ <select name="%(name)sy"> <option value=""%(sel)s>%(any)s</option> """ % { 'name' : name, 'any' : _("any year"), 'sel' : self.tmpl_is_selected(sy, 0) } this_year = int(time.strftime("%Y", time.localtime())) for year in range(this_year-20, this_year+1): box += """<option value="%d"%s>%d</option>""" % (year, self.tmpl_is_selected(sy, year), year) box += """</select>""" return box def tmpl_inputdatetype(self, dt='', ln=CFG_SITE_LANG): """ Produces input date type selection box to choose added-or-modified date search option. Parameters: - 'dt' *string - date type (c=created, m=modified) - 'ln' *string* - the language to display """ # load the right message language _ = gettext_set_language(ln) box = """<select name="dt"> <option value="">%(added)s </option> <option value="m"%(sel)s>%(modified)s </option> </select> """ % { 'added': _("Added since:"), 'modified': _("Modified since:"), 'sel': self.tmpl_is_selected(dt, 'm'), } return box - def tmpl_narrowsearch(self, as, ln, type, father, + def tmpl_narrowsearch(self, aas, ln, type, father, has_grandchildren, sons, display_grandsons, grandsons): """ Creates list of collection descendants of type *type* under title *title*. - If as==1, then links to Advanced Search interfaces; otherwise Simple Search. + If aas==1, then links to Advanced Search interfaces; otherwise Simple Search. Suitable for 'Narrow search' and 'Focus on' boxes. Parameters: - - 'as' *bool* - Should we display an advanced search box? + - 'aas' *bool* - Should we display an advanced search box? - 'ln' *string* - The language to display - 'type' *string* - The type of the produced box (virtual collections or normal collections) - 'father' *collection* - The current collection - 'has_grandchildren' *bool* - If the current collection has grand children - 'sons' *list* - The list of the sub-collections (first level) - 'display_grandsons' *bool* - If the grand children collections should be displayed (2 level deep display) - 'grandsons' *list* - The list of sub-collections (second level) """ # load the right message language _ = gettext_set_language(ln) title = {'r': _("Narrow by collection:"), 'v': _("Focus on:")}[type] if has_grandchildren: style_prolog = "<strong>" style_epilog = "</strong>" else: style_prolog = "" style_epilog = "" out = """<table class="%(narrowsearchbox)s"> <thead> <tr> <th colspan="2" align="left" class="%(narrowsearchbox)sheader"> %(title)s </th> </tr> </thead> <tbody>""" % {'title' : title, 'narrowsearchbox': {'r': 'narrowsearchbox', 'v': 'focusonsearchbox'}[type]} # iterate through sons: i = 0 for son in sons: out += """<tr><td class="%(narrowsearchbox)sbody" valign="top">""" % \ { 'narrowsearchbox': {'r': 'narrowsearchbox', 'v': 'focusonsearchbox'}[type]} if type == 'r': if son.restricted_p() and son.restricted_p() != father.restricted_p(): out += """<input type="checkbox" name="c" value="%(name)s" /></td>""" % {'name' : cgi.escape(son.name) } else: out += """<input type="checkbox" name="c" value="%(name)s" checked="checked" /></td>""" % {'name' : cgi.escape(son.name) } else: out += '</td>' out += """<td valign="top">%(link)s%(recs)s """ % { - 'link': create_html_link(self.build_search_interface_url(c=son.name, ln=ln, as=as), + 'link': create_html_link(self.build_search_interface_url(c=son.name, ln=ln, aas=aas), {}, style_prolog + cgi.escape(son.get_name(ln)) + style_epilog), 'recs' : self.tmpl_nbrecs_info(son.nbrecs, ln=ln)} if son.restricted_p(): out += """ <small class="warning">[%(msg)s]</small> """ % { 'msg' : _("restricted") } if display_grandsons and len(grandsons[i]): # iterate trough grandsons: out += """<br />""" for grandson in grandsons[i]: out += """ <small>%(link)s%(nbrec)s</small> """ % { - 'link': create_html_link(self.build_search_interface_url(c=grandson.name, ln=ln, as=as), + 'link': create_html_link(self.build_search_interface_url(c=grandson.name, ln=ln, aas=aas), {}, cgi.escape(grandson.get_name(ln))), 'nbrec' : self.tmpl_nbrecs_info(grandson.nbrecs, ln=ln)} out += """</td></tr>""" i += 1 out += "</tbody></table>" return out def tmpl_searchalso(self, ln, engines_list, collection_id): _ = gettext_set_language(ln) box_name = _("Search also:") html = """<table cellspacing="0" cellpadding="0" border="0"> <tr><td valign="top"><table class="searchalsosearchbox"> <thead><tr><th colspan="2" align="left" class="searchalsosearchboxheader">%(box_name)s </th></tr></thead><tbody> """ % locals() for engine in engines_list: internal_name = engine.name name = _(internal_name) base_url = engine.base_url if external_collection_get_state(engine, collection_id) == 3: checked = ' checked="checked"' else: checked = '' html += """<tr><td class="searchalsosearchboxbody" valign="top"> <input type="checkbox" name="ec" id="%(id)s" value="%(internal_name)s" %(checked)s /></td> <td valign="top" class="searchalsosearchboxbody"> <div style="white-space: nowrap"><label for="%(id)s">%(name)s</label> <a href="%(base_url)s"> <img src="%(siteurl)s/img/external-icon-light-8x8.gif" border="0" alt="%(name)s"/></a> </div></td></tr>""" % \ { 'checked': checked, 'base_url': base_url, 'internal_name': internal_name, 'name': cgi.escape(name), 'id': "extSearch" + nmtoken_from_string(name), 'siteurl': CFG_SITE_URL,} html += """</tbody></table></td></tr></table>""" return html def tmpl_nbrecs_info(self, number, prolog=None, epilog=None, ln=CFG_SITE_LANG): """ Return information on the number of records. Parameters: - 'number' *string* - The number of records - 'prolog' *string* (optional) - An HTML code to prefix the number (if **None**, will be '<small class="nbdoccoll">(') - 'epilog' *string* (optional) - An HTML code to append to the number (if **None**, will be ')</small>') """ if number is None: number = 0 if prolog is None: prolog = ''' <small class="nbdoccoll">(''' if epilog is None: epilog = ''')</small>''' return prolog + self.tmpl_nice_number(number, ln) + epilog def tmpl_box_restricted_content(self, ln): """ Displays a box containing a *restricted content* message Parameters: - 'ln' *string* - The language to display """ # load the right message language _ = gettext_set_language(ln) return _("This collection is restricted. If you are authorized to access it, please click on the Search button.") def tmpl_box_no_records(self, ln): """ Displays a box containing a *no content* message Parameters: - 'ln' *string* - The language to display """ # load the right message language _ = gettext_set_language(ln) return _("This collection does not contain any document yet.") - def tmpl_instant_browse(self, as, ln, recids, more_link = None): + def tmpl_instant_browse(self, aas, ln, recids, more_link = None): """ Formats a list of records (given in the recids list) from the database. Parameters: - - 'as' *int* - Advanced Search interface or not (0 or 1) + - 'aas' *int* - Advanced Search interface or not (0 or 1) - 'ln' *string* - The language to display - 'recids' *list* - the list of records from the database - 'more_link' *string* - the "More..." link for the record. If not given, will not be displayed """ # load the right message language _ = gettext_set_language(ln) body = '''<table class="latestadditionsbox">''' for recid in recids: body += ''' <tr> <td class="latestadditionsboxtimebody">%(date)s</td> <td class="latestadditionsboxrecordbody"> <abbr class="unapi-id" title="%(recid)s"></abbr> %(body)s </td> </tr>''' % { 'recid': recid['id'], 'date': recid['date'], 'body': recid['body'] } body += "</table>" if more_link: body += '<div align="right"><small>' + \ create_html_link(more_link, {}, '[>> %s]' % _("more")) + \ '</small></div>' return ''' <table class="narrowsearchbox"> <thead> <tr> <th class="narrowsearchboxheader">%(header)s</th> </tr> </thead> <tbody> <tr> <td class="narrowsearchboxbody">%(body)s</td> </tr> </tbody> </table>''' % {'header' : _("Latest additions:"), 'body' : body, } def tmpl_searchwithin_select(self, ln, fieldname, selected, values): """ Produces 'search within' selection box for the current collection. Parameters: - 'ln' *string* - The language to display - 'fieldname' *string* - the name of the select box produced - 'selected' *string* - which of the values is selected - 'values' *list* - the list of values in the select """ out = '<select name="%(fieldname)s">' % {'fieldname': fieldname} if values: for pair in values: out += """<option value="%(value)s"%(selected)s>%(text)s</option>""" % { 'value' : cgi.escape(pair['value']), 'selected' : self.tmpl_is_selected(pair['value'], selected), 'text' : cgi.escape(pair['text']) } out += """</select>""" return out def tmpl_select(self, fieldname, values, selected=None, css_class=''): """ Produces a generic select box Parameters: - 'css_class' *string* - optional, a css class to display this select with - 'fieldname' *list* - the name of the select box produced - 'selected' *string* - which of the values is selected - 'values' *list* - the list of values in the select """ if css_class != '': class_field = ' class="%s"' % css_class else: class_field = '' out = '<select name="%(fieldname)s"%(class)s>' % { 'fieldname' : fieldname, 'class' : class_field } for pair in values: if pair.get('selected', False) or pair['value'] == selected: flag = ' selected="selected"' else: flag = '' out += '<option value="%(value)s"%(selected)s>%(text)s</option>' % { 'value' : cgi.escape(str(pair['value'])), 'selected' : flag, 'text' : cgi.escape(pair['text']) } out += """</select>""" return out def tmpl_record_links(self, recid, ln): """ Displays the *More info* and *Find similar* links for a record Parameters: - 'ln' *string* - The language to display - 'recid' *string* - the id of the displayed record """ # load the right message language _ = gettext_set_language(ln) out = '''<br /><span class="moreinfo">%(detailed)s - %(similar)s</span>''' % { 'detailed': create_html_link(self.build_search_url(recid=recid, ln=ln), {}, _("Detailed record"), {'class': "moreinfo"}), 'similar': create_html_link(self.build_search_url(p="recid:%d" % recid, rm='wrd', ln=ln), {}, _("Similar records"), {'class': "moreinfo"})} if CFG_BIBRANK_SHOW_CITATION_LINKS: num_timescited = get_cited_by_count(recid) if num_timescited: out += '''<span class="moreinfo"> - %s </span>''' % \ create_html_link(self.build_search_url(p='recid:%d' % recid, rm='citation', ln=ln), {}, _("Cited by %i records") % num_timescited, {'class': "moreinfo"}) return out def tmpl_record_body(self, titles, authors, dates, rns, abstracts, urls_u, urls_z, ln): """ Displays the "HTML basic" format of a record Parameters: - 'authors' *list* - the authors (as strings) - 'dates' *list* - the dates of publication - 'rns' *list* - the quicknotes for the record - 'abstracts' *list* - the abstracts for the record - 'urls_u' *list* - URLs to the original versions of the record - 'urls_z' *list* - Not used """ out = "" for title in titles: out += "<strong>%(title)s</strong> " % { 'title' : cgi.escape(title) } if authors: out += " / " for author in authors[:CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD]: out += '%s; ' % \ create_html_link(self.build_search_url(p=author, f='author', ln=ln), {}, cgi.escape(author)) if len(authors) > CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD: out += "<em>et al</em>" for date in dates: out += " %s." % cgi.escape(date) for rn in rns: out += """ <small class="quicknote">[%(rn)s]</small>""" % {'rn' : cgi.escape(rn)} for abstract in abstracts: out += "<br /><small>%(abstract)s [...]</small>" % {'abstract' : cgi.escape(abstract[:1+string.find(abstract, '.')]) } for idx in range(0, len(urls_u)): out += """<br /><small class="note"><a class="note" href="%(url)s">%(name)s</a></small>""" % { 'url' : urls_u[idx], 'name' : urls_u[idx] } return out def tmpl_search_in_bibwords(self, p, f, ln, nearest_box): """ Displays the *Words like current ones* links for a search Parameters: - 'p' *string* - Current search words - 'f' *string* - the fields in which the search was done - 'nearest_box' *string* - the HTML code for the "nearest_terms" box - most probably from a create_nearest_terms_box call """ # load the right message language _ = gettext_set_language(ln) out = '<p>' if f: out += _("Words nearest to %(x_word)s inside %(x_field)s in any collection are:") % {'x_word': '<em>' + cgi.escape(p) + '</em>', 'x_field': '<em>' + cgi.escape(f) + '</em>'} else: out += _("Words nearest to %(x_word)s in any collection are:") % {'x_word': '<em>' + cgi.escape(p) + '</em>'} out += '<br />' + nearest_box + '</p>' return out def tmpl_nearest_term_box(self, p, ln, f, terminfo, intro): """ Displays the *Nearest search terms* box Parameters: - 'p' *string* - Current search words - 'f' *string* - a collection description (if the search has been completed in a collection) - 'ln' *string* - The language to display - 'terminfo': tuple (term, hits, argd) for each near term - 'intro' *string* - the intro HTML to prefix the box with """ out = '''<table class="nearesttermsbox" cellpadding="0" cellspacing="0" border="0">''' for term, hits, argd in terminfo: if hits: hitsinfo = str(hits) else: hitsinfo = '-' term = cgi.escape(term) if term == p: # print search word for orientation: nearesttermsboxbody_class = "nearesttermsboxbodyselected" if hits > 0: term = create_html_link(self.build_search_url(argd), {}, term, {'class': "nearesttermsselected"}) else: nearesttermsboxbody_class = "nearesttermsboxbody" term = create_html_link(self.build_search_url(argd), {}, term, {'class': "nearestterms"}) out += '''\ <tr> <td class="%(nearesttermsboxbody_class)s" align="right">%(hits)s</td> <td class="%(nearesttermsboxbody_class)s" width="15"> </td> <td class="%(nearesttermsboxbody_class)s" align="left">%(term)s</td> </tr> ''' % {'hits': hitsinfo, 'nearesttermsboxbody_class': nearesttermsboxbody_class, 'term': term} out += "</table>" return intro + "<blockquote>" + out + "</blockquote>" def tmpl_browse_pattern(self, f, fn, ln, browsed_phrases_in_colls, colls, rg): """ Displays the *Nearest search terms* box Parameters: - 'f' *string* - field (*not* i18nized) - 'fn' *string* - field name (i18nized) - 'ln' *string* - The language to display - 'browsed_phrases_in_colls' *array* - the phrases to display - 'colls' *array* - the list of collection parameters of the search (c's) - 'rg' *int* - the number of records """ # load the right message language _ = gettext_set_language(ln) out = """<table class="searchresultsbox"> <thead> <tr> <th class="searchresultsboxheader" style="text-align: right;" width="15"> %(hits)s </th> <th class="searchresultsboxheader" width="15"> </th> <th class="searchresultsboxheader" style="text-align: left;"> %(fn)s </th> </tr> </thead> <tbody>""" % { 'hits' : _("Hits"), 'fn' : cgi.escape(fn) } if len(browsed_phrases_in_colls) == 1: # one hit only found: phrase, nbhits = browsed_phrases_in_colls[0][0], browsed_phrases_in_colls[0][1] query = {'c': colls, 'ln': ln, 'p': '"%s"' % phrase.replace('"', '\\"'), 'f': f, 'rg' : rg} out += """<tr> <td class="searchresultsboxbody" style="text-align: right;"> %(nbhits)s </td> <td class="searchresultsboxbody" width="15"> </td> <td class="searchresultsboxbody" style="text-align: left;"> %(link)s </td> </tr>""" % {'nbhits': nbhits, 'link': create_html_link(self.build_search_url(query), {}, cgi.escape(phrase))} elif len(browsed_phrases_in_colls) > 1: # first display what was found but the last one: for phrase, nbhits in browsed_phrases_in_colls[:-1]: query = {'c': colls, 'ln': ln, 'p': '"%s"' % phrase.replace('"', '\\"'), 'f': f, 'rg' : rg} out += """<tr> <td class="searchresultsboxbody" style="text-align: right;"> %(nbhits)s </td> <td class="searchresultsboxbody" width="15"> </td> <td class="searchresultsboxbody" style="text-align: left;"> %(link)s </td> </tr>""" % {'nbhits' : nbhits, 'link': create_html_link(self.build_search_url(query), {}, cgi.escape(phrase))} # now display last hit as "previous term": phrase, nbhits = browsed_phrases_in_colls[0] query_previous = {'c': colls, 'ln': ln, 'p': '"%s"' % phrase.replace('"', '\\"'), 'f': f, 'rg' : rg} # now display last hit as "next term": phrase, nbhits = browsed_phrases_in_colls[-1] query_next = {'c': colls, 'ln': ln, 'p': '"%s"' % phrase.replace('"', '\\"'), 'f': f, 'rg' : rg} out += """<tr><td colspan="2" class="normal"> </td> <td class="normal"> %(link_previous)s <img src="%(siteurl)s/img/sp.gif" alt="" border="0" /> <img src="%(siteurl)s/img/sn.gif" alt="" border="0" /> %(link_next)s </td> </tr>""" % {'link_previous': create_html_link(self.build_search_url(query_previous, action='browse'), {}, _("Previous")), 'link_next': create_html_link(self.build_search_url(query_next, action='browse'), {}, _("next")), 'siteurl' : CFG_SITE_URL} out += """</tbody> </table>""" return out - def tmpl_search_box(self, ln, as, cc, cc_intl, ot, sp, + def tmpl_search_box(self, ln, aas, cc, cc_intl, ot, sp, action, fieldslist, f1, f2, f3, m1, m2, m3, p1, p2, p3, op1, op2, rm, p, f, coll_selects, d1y, d2y, d1m, d2m, d1d, d2d, dt, sort_fields, sf, so, ranks, sc, rg, formats, of, pl, jrec, ec, show_colls=True): """ Displays the *Nearest search terms* box Parameters: - 'ln' *string* - The language to display - - 'as' *bool* - Should we display an advanced search box? -1 -> 1, from simpler to more advanced + - 'aas' *bool* - Should we display an advanced search box? -1 -> 1, from simpler to more advanced - 'cc_intl' *string* - the i18nized current collection name, used for display - 'cc' *string* - the internal current collection name - 'ot', 'sp' *string* - hidden values - 'action' *string* - the action demanded by the user - 'fieldslist' *list* - the list of all fields available, for use in select within boxes in advanced search - 'p, f, f1, f2, f3, m1, m2, m3, p1, p2, p3, op1, op2, op3, rm' *strings* - the search parameters - 'coll_selects' *array* - a list of lists, each containing the collections selects to display - 'd1y, d2y, d1m, d2m, d1d, d2d' *int* - the search between dates - 'dt' *string* - the dates' types (creation dates, modification dates) - 'sort_fields' *array* - the select information for the sort fields - 'sf' *string* - the currently selected sort field - 'so' *string* - the currently selected sort order ("a" or "d") - 'ranks' *array* - ranking methods - 'rm' *string* - selected ranking method - 'sc' *string* - split by collection or not - 'rg' *string* - selected results/page - 'formats' *array* - available output formats - 'of' *string* - the selected output format - 'pl' *string* - `limit to' search pattern - show_colls *bool* - show cc_intl in page title, and propose coll selection box? """ # load the right message language _ = gettext_set_language(ln) # These are hidden fields the user does not manipulate # directly - if as == -1: + if aas == -1: argd = drop_default_urlargd({ - 'ln': ln, 'as': as, + 'ln': ln, 'aas': aas, 'ot': ot, 'sp': sp, 'ec': ec, }, self.search_results_default_urlargd) else: argd = drop_default_urlargd({ - 'cc': cc, 'ln': ln, 'as': as, + 'cc': cc, 'ln': ln, 'aas': aas, 'ot': ot, 'sp': sp, 'ec': ec, }, self.search_results_default_urlargd) out = "" if show_colls: # display cc name if asked for out += ''' <h1 class="headline">%(ccname)s</h1>''' % {'ccname' : cgi.escape(cc_intl),} out += ''' <form name="search" action="%(siteurl)s/search" method="get"> ''' % {'siteurl' : CFG_SITE_URL} # Only add non-default hidden values for field, value in argd.items(): out += self.tmpl_input_hidden(field, value) leadingtext = _("Search") if action == 'browse': leadingtext = _("Browse") - if as == 1: + if aas == 1: # print Advanced Search form: # define search box elements: out += ''' <table class="searchbox"> <thead> <tr> <th colspan="3" class="searchboxheader"> %(leading)s: </th> </tr> </thead> <tbody> <tr valign="top" style="white-space:nowrap;"> <td class="searchboxbody">%(matchbox1)s <input type="text" name="p1" size="%(sizepattern)d" value="%(p1)s" /> </td> <td class="searchboxbody">%(searchwithin1)s</td> <td class="searchboxbody">%(andornot1)s</td> </tr> <tr valign="top"> <td class="searchboxbody">%(matchbox2)s <input type="text" name="p2" size="%(sizepattern)d" value="%(p2)s" /> </td> <td class="searchboxbody">%(searchwithin2)s</td> <td class="searchboxbody">%(andornot2)s</td> </tr> <tr valign="top"> <td class="searchboxbody">%(matchbox3)s <input type="text" name="p3" size="%(sizepattern)d" value="%(p3)s" /> </td> <td class="searchboxbody">%(searchwithin3)s</td> <td class="searchboxbody" style="white-space:nowrap;"> <input class="formbutton" type="submit" name="action_search" value="%(search)s" /> <input class="formbutton" type="submit" name="action_browse" value="%(browse)s" /> </td> </tr> <tr valign="bottom"> <td colspan="3" align="right" class="searchboxbody"> <small> <a href="%(siteurl)s/help/search-tips%(langlink)s">%(search_tips)s</a> :: %(simple_search)s </small> </td> </tr> </tbody> </table> ''' % { 'simple_search': create_html_link(self.build_search_url(p=p1, f=f1, rm=rm, cc=cc, ln=ln, jrec=jrec, rg=rg), {}, _("Simple Search")), 'leading' : leadingtext, 'sizepattern' : CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH, 'matchbox1' : self.tmpl_matchtype_box('m1', m1, ln=ln), 'p1' : cgi.escape(p1,1), 'searchwithin1' : self.tmpl_searchwithin_select( ln = ln, fieldname = 'f1', selected = f1, values = self._add_mark_to_field(value=f1, fields=fieldslist, ln=ln) ), 'andornot1' : self.tmpl_andornot_box( name = 'op1', value = op1, ln = ln ), 'matchbox2' : self.tmpl_matchtype_box('m2', m2, ln=ln), 'p2' : cgi.escape(p2,1), 'searchwithin2' : self.tmpl_searchwithin_select( ln = ln, fieldname = 'f2', selected = f2, values = self._add_mark_to_field(value=f2, fields=fieldslist, ln=ln) ), 'andornot2' : self.tmpl_andornot_box( name = 'op2', value = op2, ln = ln ), 'matchbox3' : self.tmpl_matchtype_box('m3', m3, ln=ln), 'p3' : cgi.escape(p3,1), 'searchwithin3' : self.tmpl_searchwithin_select( ln = ln, fieldname = 'f3', selected = f3, values = self._add_mark_to_field(value=f3, fields=fieldslist, ln=ln) ), 'search' : _("Search"), 'browse' : _("Browse"), 'siteurl' : CFG_SITE_URL, 'ln' : ln, 'langlink': ln != CFG_SITE_LANG and '?ln=' + ln or '', 'search_tips': _("Search Tips") } - elif as == 0: + elif aas == 0: # print Simple Search form: out += ''' <table class="searchbox"> <thead> <tr> <th colspan="3" class="searchboxheader"> %(leading)s: </th> </tr> </thead> <tbody> <tr valign="top"> <td class="searchboxbody"><input type="text" name="p" size="%(sizepattern)d" value="%(p)s" /></td> <td class="searchboxbody">%(searchwithin)s</td> <td class="searchboxbody"> <input class="formbutton" type="submit" name="action_search" value="%(search)s" /> <input class="formbutton" type="submit" name="action_browse" value="%(browse)s" /> </td> </tr> <tr valign="bottom"> <td colspan="3" align="right" class="searchboxbody"> <small> <a href="%(siteurl)s/help/search-tips%(langlink)s">%(search_tips)s</a> :: %(advanced_search)s </small> </td> </tr> </tbody> </table> ''' % { 'advanced_search': create_html_link(self.build_search_url(p1=p, f1=f, rm=rm, - as=max(CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES), + aas=max(CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES), cc=cc, jrec=jrec, ln=ln, rg=rg), {}, _("Advanced Search")), 'leading' : leadingtext, 'sizepattern' : CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH, 'p' : cgi.escape(p, 1), 'searchwithin' : self.tmpl_searchwithin_select( ln = ln, fieldname = 'f', selected = f, values = self._add_mark_to_field(value=f, fields=fieldslist, ln=ln) ), 'search' : _("Search"), 'browse' : _("Browse"), 'siteurl' : CFG_SITE_URL, 'ln' : ln, 'langlink': ln != CFG_SITE_LANG and '?ln=' + ln or '', 'search_tips': _("Search Tips") } else: # EXPERIMENTAL # print light search form: search_in = '' if cc_intl != CFG_SITE_NAME_INTL.get(ln, CFG_SITE_NAME): search_in = ''' <input type="radio" name="cc" value="%(collection_id)s" id="searchCollection" checked="checked"/> <label for="searchCollection">%(search_in_collection_name)s</label> <input type="radio" name="cc" value="%(root_collection_name)s" id="searchEverywhere" /> <label for="searchEverywhere">%(search_everywhere)s</label> ''' % {'search_in_collection_name': _("Search in %(x_collection_name)s") % \ {'x_collection_name': cgi.escape(cc_intl)}, 'collection_id': cc, 'root_collection_name': CFG_SITE_NAME, 'search_everywhere': _("Search everywhere")} out += ''' <table> <tr valign="top"> <td class="searchboxbody"><input type="text" name="p" size="%(sizepattern)d" value="%(p)s" /></td> <td class="searchboxbody"> <input class="formbutton" type="submit" name="action_search" value="%(search)s" /> </td> <td class="searchboxbody" align="left" rowspan="2" valign="top"> <small><small> <a href="%(siteurl)s/help/search-tips%(langlink)s">%(search_tips)s</a><br/> %(advanced_search)s </td> </tr> </table> <small>%(search_in)s</small> ''' % { 'advanced_search': create_html_link(self.build_search_url(p1=p, f1=f, rm=rm, - as=max(CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES), + aas=max(CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES), cc=cc, jrec=jrec, ln=ln, rg=rg), {}, _("Advanced Search")), 'leading' : leadingtext, 'sizepattern' : CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH, 'p' : cgi.escape(p, 1), 'searchwithin' : self.tmpl_searchwithin_select( ln = ln, fieldname = 'f', selected = f, values = self._add_mark_to_field(value=f, fields=fieldslist, ln=ln) ), 'search' : _("Search"), 'browse' : _("Browse"), 'siteurl' : CFG_SITE_URL, 'ln' : ln, 'langlink': ln != CFG_SITE_LANG and '?ln=' + ln or '', 'search_tips': _("Search Tips"), 'search_in': search_in } ## secondly, print Collection(s) box: - if show_colls and as > -1: + if show_colls and aas > -1: # display collections only if there is more than one selects = '' for sel in coll_selects: selects += self.tmpl_select(fieldname='c', values=sel) out += """ <table class="searchbox"> <thead> <tr> <th colspan="3" class="searchboxheader"> %(leading)s %(msg_coll)s: </th> </tr> </thead> <tbody> <tr valign="bottom"> <td valign="top" class="searchboxbody"> %(colls)s </td> </tr> </tbody> </table> """ % { 'leading' : leadingtext, 'msg_coll' : _("collections"), 'colls' : selects, } ## thirdly, print search limits, if applicable: if action != _("Browse") and pl: out += """<table class="searchbox"> <thead> <tr> <th class="searchboxheader"> %(limitto)s </th> </tr> </thead> <tbody> <tr valign="bottom"> <td class="searchboxbody"> <input type="text" name="pl" size="%(sizepattern)d" value="%(pl)s" /> </td> </tr> </tbody> </table>""" % { 'limitto' : _("Limit to:"), 'sizepattern' : CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH, 'pl' : cgi.escape(pl, 1), } ## fourthly, print from/until date boxen, if applicable: if action == _("Browse") or (d1y==0 and d1m==0 and d1d==0 and d2y==0 and d2m==0 and d2d==0): pass # do not need it else: cell_6_a = self.tmpl_inputdatetype(dt, ln) + self.tmpl_inputdate("d1", ln, d1y, d1m, d1d) cell_6_b = self.tmpl_inputdate("d2", ln, d2y, d2m, d2d) out += """<table class="searchbox"> <thead> <tr> <th class="searchboxheader"> %(added)s </th> <th class="searchboxheader"> %(until)s </th> </tr> </thead> <tbody> <tr valign="bottom"> <td class="searchboxbody">%(added_or_modified)s %(date1)s</td> <td class="searchboxbody">%(date2)s</td> </tr> </tbody> </table>""" % { 'added' : _("Added/modified since:"), 'until' : _("until:"), 'added_or_modified': self.tmpl_inputdatetype(dt, ln), 'date1' : self.tmpl_inputdate("d1", ln, d1y, d1m, d1d), 'date2' : self.tmpl_inputdate("d2", ln, d2y, d2m, d2d), } ## fifthly, print Display results box, including sort/rank, formats, etc: - if action != _("Browse") and as > -1: + if action != _("Browse") and aas > -1: rgs = [] for i in [10, 25, 50, 100, 250, 500]: if i <= CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS: rgs.append({ 'value' : i, 'text' : "%d %s" % (i, _("results"))}) # enrich sort fields list if we are sorting by some MARC tag: sort_fields = self._add_mark_to_field(value=sf, fields=sort_fields, ln=ln) # create sort by HTML box: out += """<table class="searchbox"> <thead> <tr> <th class="searchboxheader"> %(sort_by)s </th> <th class="searchboxheader"> %(display_res)s </th> <th class="searchboxheader"> %(out_format)s </th> </tr> </thead> <tbody> <tr valign="bottom"> <td valign="top" class="searchboxbody"> %(select_sf)s %(select_so)s %(select_rm)s </td> <td valign="top" class="searchboxbody"> %(select_rg)s %(select_sc)s </td> <td valign="top" class="searchboxbody">%(select_of)s</td> </tr> </tbody> </table>""" % { 'sort_by' : _("Sort by:"), 'display_res' : _("Display results:"), 'out_format' : _("Output format:"), 'select_sf' : self.tmpl_select(fieldname = 'sf', values = sort_fields, selected = sf, css_class = 'address'), 'select_so' : self.tmpl_select(fieldname = 'so', values = [{ 'value' : 'a', 'text' : _("asc.") }, { 'value' : 'd', 'text' : _("desc.") }], selected = so, css_class = 'address'), 'select_rm' : self.tmpl_select(fieldname = 'rm', values = ranks, selected = rm, css_class = 'address'), 'select_rg' : self.tmpl_select(fieldname = 'rg', values = rgs, selected = rg, css_class = 'address'), 'select_sc' : self.tmpl_select(fieldname = 'sc', values = [{ 'value' : 0, 'text' : _("single list") }, { 'value' : 1, 'text' : _("split by collection") }], selected = sc, css_class = 'address'), 'select_of' : self.tmpl_searchwithin_select( ln = ln, fieldname = 'of', selected = of, values = self._add_mark_to_field(value=of, fields=formats, chars=3, ln=ln) ), } ## last but not least, print end of search box: out += """</form>""" return out def tmpl_input_hidden(self, name, value): "Produces the HTML code for a hidden field " if isinstance(value, list): list_input = [self.tmpl_input_hidden(name, val) for val in value] return "\n".join(list_input) + # # Treat `as', `aas' arguments specially: + if name == 'aas': + name = 'as' + return """<input type="hidden" name="%(name)s" value="%(value)s" />""" % { 'name' : cgi.escape(str(name), 1), 'value' : cgi.escape(str(value), 1), } def _add_mark_to_field(self, value, fields, ln, chars=1): """Adds the current value as a MARC tag in the fields array Useful for advanced search""" # load the right message language _ = gettext_set_language(ln) out = fields if value and str(value[0:chars]).isdigit(): out.append({'value' : value, 'text' : str(value) + " " + _("MARC tag") }) return out def tmpl_search_pagestart(self, ln) : "page start for search page. Will display after the page header" return """<div class="pagebody"><div class="pagebodystripemiddle">""" def tmpl_search_pageend(self, ln) : "page end for search page. Will display just before the page footer" return """</div></div>""" def tmpl_print_warning(self, msg, type, prologue, epilogue): """Prints warning message and flushes output. Parameters: - 'msg' *string* - The message string - 'type' *string* - the warning type - 'prologue' *string* - HTML code to display before the warning - 'epilogue' *string* - HTML code to display after the warning """ out = '\n%s<span class="quicknote">' % (prologue) if type: out += '%s: ' % type out += '%s</span>%s' % (msg, epilogue) return out def tmpl_print_search_info(self, ln, middle_only, collection, collection_name, collection_id, - as, sf, so, rm, rg, nb_found, of, ot, p, f, f1, + aas, sf, so, rm, rg, nb_found, of, ot, p, f, f1, f2, f3, m1, m2, m3, op1, op2, p1, p2, p3, d1y, d1m, d1d, d2y, d2m, d2d, dt, all_fieldcodes, cpu_time, pl_in_url, jrec, sc, sp): """Prints stripe with the information on 'collection' and 'nb_found' results and CPU time. Also, prints navigation links (beg/next/prev/end) inside the results set. If middle_only is set to 1, it will only print the middle box information (beg/netx/prev/end/etc) links. This is suitable for displaying navigation links at the bottom of the search results page. Parameters: - 'ln' *string* - The language to display - 'middle_only' *bool* - Only display parts of the interface - 'collection' *string* - the collection name - 'collection_name' *string* - the i18nized current collection name - - 'as' *bool* - if we display the advanced search interface + - 'aas' *bool* - if we display the advanced search interface - 'sf' *string* - the currently selected sort format - 'so' *string* - the currently selected sort order ("a" or "d") - 'rm' *string* - selected ranking method - 'rg' *int* - selected results/page - 'nb_found' *int* - number of results found - 'of' *string* - the selected output format - 'ot' *string* - hidden values - 'p' *string* - Current search words - 'f' *string* - the fields in which the search was done - 'f1, f2, f3, m1, m2, m3, p1, p2, p3, op1, op2' *strings* - the search parameters - 'jrec' *int* - number of first record on this page - 'd1y, d2y, d1m, d2m, d1d, d2d' *int* - the search between dates - 'dt' *string* the dates' type (creation date, modification date) - 'all_fieldcodes' *array* - all the available fields - 'cpu_time' *float* - the time of the query in seconds """ # load the right message language _ = gettext_set_language(ln) out = "" # left table cells: print collection name if not middle_only: out += ''' <a name="%(collection_id)s"></a> <form action="%(siteurl)s/search" method="get"> <table class="searchresultsbox"><tr><td class="searchresultsboxheader" align="left"> <strong><big>%(collection_link)s</big></strong></td> ''' % { 'collection_id': collection_id, 'siteurl' : CFG_SITE_URL, - 'collection_link': create_html_link(self.build_search_interface_url(c=collection, as=as, ln=ln), + 'collection_link': create_html_link(self.build_search_interface_url(c=collection, aas=aas, ln=ln), {}, cgi.escape(collection_name)) } else: out += """ <form action="%(siteurl)s/search" method="get"><div align="center"> """ % { 'siteurl' : CFG_SITE_URL } # middle table cell: print beg/next/prev/end arrows: if not middle_only: out += """<td class="searchresultsboxheader" align="center"> %(recs_found)s """ % { 'recs_found' : _("%s records found") % ('<strong>' + self.tmpl_nice_number(nb_found, ln) + '</strong>') } else: out += "<small>" if nb_found > rg: out += "" + cgi.escape(collection_name) + " : " + _("%s records found") % ('<strong>' + self.tmpl_nice_number(nb_found, ln) + '</strong>') + " " if nb_found > rg: # navig.arrows are needed, since we have many hits query = {'p': p, 'f': f, 'cc': collection, 'sf': sf, 'so': so, 'sp': sp, 'rm': rm, 'of': of, 'ot': ot, - 'as': as, 'ln': ln, + 'aas': aas, 'ln': ln, 'p1': p1, 'p2': p2, 'p3': p3, 'f1': f1, 'f2': f2, 'f3': f3, 'm1': m1, 'm2': m2, 'm3': m3, 'op1': op1, 'op2': op2, 'sc': 0, 'd1y': d1y, 'd1m': d1m, 'd1d': d1d, 'd2y': d2y, 'd2m': d2m, 'd2d': d2d, 'dt': dt, } # @todo here def img(gif, txt): return '<img src="%(siteurl)s/img/%(gif)s.gif" alt="%(txt)s" border="0" />' % { 'txt': txt, 'gif': gif, 'siteurl': CFG_SITE_URL} if jrec-rg > 1: out += create_html_link(self.build_search_url(query, jrec=1, rg=rg), {}, img('sb', _("begin")), {'class': 'img'}) if jrec > 1: out += create_html_link(self.build_search_url(query, jrec=max(jrec-rg, 1), rg=rg), {}, img('sp', _("previous")), {'class': 'img'}) if jrec+rg-1 < nb_found: out += "%d - %d" % (jrec, jrec+rg-1) else: out += "%d - %d" % (jrec, nb_found) if nb_found >= jrec+rg: out += create_html_link(self.build_search_url(query, jrec=jrec+rg, rg=rg), {}, img('sn', _("next")), {'class':'img'}) if nb_found >= jrec+rg+rg: out += create_html_link(self.build_search_url(query, jrec=nb_found-rg+1, rg=rg), {}, img('se', _("end")), {'class': 'img'}) # still in the navigation part cc = collection sc = 0 - for var in ['p', 'cc', 'f', 'sf', 'so', 'of', 'rg', 'as', 'ln', 'p1', 'p2', 'p3', 'f1', 'f2', 'f3', 'm1', 'm2', 'm3', 'op1', 'op2', 'sc', 'd1y', 'd1m', 'd1d', 'd2y', 'd2m', 'd2d', 'dt']: + for var in ['p', 'cc', 'f', 'sf', 'so', 'of', 'rg', 'aas', 'ln', 'p1', 'p2', 'p3', 'f1', 'f2', 'f3', 'm1', 'm2', 'm3', 'op1', 'op2', 'sc', 'd1y', 'd1m', 'd1d', 'd2y', 'd2m', 'd2d', 'dt']: out += self.tmpl_input_hidden(name = var, value = vars()[var]) for var in ['ot', 'sp', 'rm']: if vars()[var]: out += self.tmpl_input_hidden(name = var, value = vars()[var]) if pl_in_url: fieldargs = cgi.parse_qs(pl_in_url) for fieldcode in all_fieldcodes: # get_fieldcodes(): if fieldargs.has_key(fieldcode): for val in fieldargs[fieldcode]: out += self.tmpl_input_hidden(name = fieldcode, value = val) out += """ %(jump)s <input type="text" name="jrec" size="4" value="%(jrec)d" />""" % { 'jump' : _("jump to record:"), 'jrec' : jrec, } if not middle_only: out += "</td>" else: out += "</small>" # right table cell: cpu time info if not middle_only: if cpu_time > -1: out += """<td class="searchresultsboxheader" align="right"><small>%(time)s</small> </td>""" % { 'time' : _("Search took %s seconds.") % ('%.2f' % cpu_time), } out += "</tr></table>" else: out += "</div>" out += "</form>" return out def tmpl_nice_number(self, number, ln=CFG_SITE_LANG, thousands_separator=',', max_ndigits_after_dot=None): """ Return nicely printed number NUMBER in language LN using given THOUSANDS_SEPARATOR character. If max_ndigits_after_dot is specified and the number is float, the number is rounded by taking in consideration up to max_ndigits_after_dot digit after the dot. This version does not pay attention to locale. See tmpl_nice_number_via_locale(). """ if type(number) is float: if max_ndigits_after_dot is not None: number = round(number, max_ndigits_after_dot) int_part, frac_part = str(number).split('.') return '%s.%s' % (self.tmpl_nice_number(int(int_part), ln, thousands_separator), frac_part) else: chars_in = list(str(number)) number = len(chars_in) chars_out = [] for i in range(0, number): if i % 3 == 0 and i != 0: chars_out.append(thousands_separator) chars_out.append(chars_in[number-i-1]) chars_out.reverse() return ''.join(chars_out) def tmpl_nice_number_via_locale(self, number, ln=CFG_SITE_LANG): """ Return nicely printed number NUM in language LN using the locale. See also version tmpl_nice_number(). """ if number is None: return None # Temporarily switch the numeric locale to the requested one, and format the number # In case the system has no locale definition, use the vanilla form ol = locale.getlocale(locale.LC_NUMERIC) try: locale.setlocale(locale.LC_NUMERIC, self.tmpl_localemap.get(ln, self.tmpl_default_locale)) except locale.Error: return str(number) try: number = locale.format('%d', number, True) except TypeError: return str(number) locale.setlocale(locale.LC_NUMERIC, ol) return number def tmpl_record_format_htmlbrief_header(self, ln): """Returns the header of the search results list when output is html brief. Note that this function is called for each collection results when 'split by collection' is enabled. See also: tmpl_record_format_htmlbrief_footer(..), tmpl_record_format_htmlbrief_body(..) Parameters: - 'ln' *string* - The language to display """ # load the right message language _ = gettext_set_language(ln) out = """ <form action="%(siteurl)s/yourbaskets/add" method="post"> <table> """ % { 'siteurl' : CFG_SITE_URL, } return out def tmpl_record_format_htmlbrief_footer(self, ln): """Returns the footer of the search results list when output is html brief. Note that this function is called for each collection results when 'split by collection' is enabled. See also: tmpl_record_format_htmlbrief_header(..), tmpl_record_format_htmlbrief_body(..) Parameters: - 'ln' *string* - The language to display """ # load the right message language _ = gettext_set_language(ln) out = """</table> <br /><input class="formbutton" type="submit" name="action" value="%(basket)s" /> </form>""" % { 'basket' : _("ADD TO BASKET") } return out def tmpl_record_format_htmlbrief_body(self, ln, recid, row_number, relevance, record, relevances_prologue, relevances_epilogue): """Returns the html brief format of one record. Used in the search results list for each record. See also: tmpl_record_format_htmlbrief_header(..), tmpl_record_format_htmlbrief_footer(..) Parameters: - 'ln' *string* - The language to display - 'row_number' *int* - The position of this record in the list - 'recid' *int* - The recID - 'relevance' *string* - The relevance of the record - 'record' *string* - The formatted record - 'relevances_prologue' *string* - HTML code to prepend the relevance indicator - 'relevances_epilogue' *string* - HTML code to append to the relevance indicator (used mostly for formatting) """ # load the right message language _ = gettext_set_language(ln) out = """ <tr><td valign="top" align="right" style="white-space: nowrap;"> <input name="recid" type="checkbox" value="%(recid)s" /> <abbr class="unapi-id" title="%(recid)s"></abbr> %(number)s. """ % {'recid': recid, 'number': row_number} if relevance: out += """<br /><div class="rankscoreinfo"><a title="rank score">%(prologue)s%(relevance)s%(epilogue)s</a></div>""" % { 'prologue' : relevances_prologue, 'epilogue' : relevances_epilogue, 'relevance' : relevance } out += """</td><td valign="top">%s</td></tr>""" % record return out def tmpl_print_results_overview(self, ln, results_final_nb_total, cpu_time, results_final_nb, colls, ec): """Prints results overview box with links to particular collections below. Parameters: - 'ln' *string* - The language to display - 'results_final_nb_total' *int* - The total number of hits for the query - 'colls' *array* - The collections with hits, in the format: - 'coll[code]' *string* - The code of the collection (canonical name) - 'coll[name]' *string* - The display name of the collection - 'results_final_nb' *array* - The number of hits, indexed by the collection codes: - 'cpu_time' *string* - The time the query took - 'url_args' *string* - The rest of the search query - 'ec' *array* - selected external collections """ if len(colls) == 1 and not ec: # if one collection only and no external collections, print nothing: return "" # load the right message language _ = gettext_set_language(ln) # first find total number of hits: out = """<table class="searchresultsbox"> <thead><tr><th class="searchresultsboxheader">%(founds)s</th></tr></thead> <tbody><tr><td class="searchresultsboxbody"> """ % { 'founds' : _("%(x_fmt_open)sResults overview:%(x_fmt_close)s Found %(x_nb_records)s records in %(x_nb_seconds)s seconds.") %\ {'x_fmt_open': '<strong>', 'x_fmt_close': '</strong>', 'x_nb_records': '<strong>' + self.tmpl_nice_number(results_final_nb_total, ln) + '</strong>', 'x_nb_seconds': '%.2f' % cpu_time} } # then print hits per collection: for coll in colls: if results_final_nb.has_key(coll['code']) and results_final_nb[coll['code']] > 0: out += '''<strong><a href="#%(coll)s">%(coll_name)s</a></strong>, <a href="#%(coll)s">%(number)s</a><br />''' % { 'coll' : coll['id'], 'coll_name' : cgi.escape(coll['name']), 'number' : _("%s records found") % ('<strong>' + self.tmpl_nice_number(results_final_nb[coll['code']], ln) + '</strong>') } out += "</td></tr></tbody></table>" return out def tmpl_print_searchresultbox(self, header, body): """print a nicely formatted box for search results """ #_ = gettext_set_language(ln) # first find total number of hits: out = '<table class="searchresultsbox"><thead><tr><th class="searchresultsboxheader">'+header+'</th></tr></thead><tbody><tr><td class="searchresultsboxbody">'+body+'</td></tr></tbody></table>' return out def tmpl_search_no_boolean_hits(self, ln, nearestterms): """No hits found, proposes alternative boolean queries Parameters: - 'ln' *string* - The language to display - 'nearestterms' *array* - Parts of the interface to display, in the format: - 'nearestterms[nbhits]' *int* - The resulting number of hits - 'nearestterms[url_args]' *string* - The search parameters - 'nearestterms[p]' *string* - The search terms """ # load the right message language _ = gettext_set_language(ln) out = _("Boolean query returned no hits. Please combine your search terms differently.") out += '''<blockquote><table class="nearesttermsbox" cellpadding="0" cellspacing="0" border="0">''' for term, hits, argd in nearestterms: out += '''\ <tr> <td class="nearesttermsboxbody" align="right">%(hits)s</td> <td class="nearesttermsboxbody" width="15"> </td> <td class="nearesttermsboxbody" align="left"> %(link)s </td> </tr>''' % {'hits' : hits, 'link': create_html_link(self.build_search_url(argd), {}, cgi.escape(term), {'class': "nearestterms"})} out += """</table></blockquote>""" return out def tmpl_similar_author_names(self, authors, ln): """No hits found, proposes alternative boolean queries Parameters: - 'authors': a list of (name, hits) tuples - 'ln' *string* - The language to display """ # load the right message language _ = gettext_set_language(ln) out = '''<a name="googlebox"></a> <table class="googlebox"><tr><th colspan="2" class="googleboxheader">%(similar)s</th></tr>''' % { 'similar' : _("See also: similar author names") } for author, hits in authors: out += '''\ <tr> <td class="googleboxbody">%(nb)d</td> <td class="googleboxbody">%(link)s</td> </tr>''' % {'link': create_html_link( self.build_search_url(p=author, f='author', ln=ln), {}, cgi.escape(author), {'class':"google"}), 'nb' : hits} out += """</table>""" return out def tmpl_print_record_detailed(self, recID, ln): """Displays a detailed on-the-fly record Parameters: - 'ln' *string* - The language to display - 'recID' *int* - The record id """ # okay, need to construct a simple "Detailed record" format of our own: out = "<p> " # secondly, title: titles = get_fieldvalues(recID, "245__a") for title in titles: out += "<p><center><big><strong>%s</strong></big></center></p>" % cgi.escape(title) # thirdly, authors: authors = get_fieldvalues(recID, "100__a") + get_fieldvalues(recID, "700__a") if authors: out += "<p><center>" for author in authors: out += '%s; ' % create_html_link(self.build_search_url( ln=ln, p=author, f='author'), {}, cgi.escape(author)) out += "</center></p>" # fourthly, date of creation: dates = get_fieldvalues(recID, "260__c") for date in dates: out += "<p><center><small>%s</small></center></p>" % date # fifthly, abstract: abstracts = get_fieldvalues(recID, "520__a") for abstract in abstracts: out += """<p style="margin-left: 15%%; width: 70%%"> <small><strong>Abstract:</strong> %s</small></p>""" % abstract # fifthly bis, keywords: keywords = get_fieldvalues(recID, "6531_a") if len(keywords): out += """<p style="margin-left: 15%%; width: 70%%"> <small><strong>Keyword(s):</strong>""" for keyword in keywords: out += '%s; ' % create_html_link( self.build_search_url(ln=ln, p=keyword, f='keyword'), {}, cgi.escape(keyword)) out += '</small></p>' # fifthly bis bis, published in: prs_p = get_fieldvalues(recID, "909C4p") prs_v = get_fieldvalues(recID, "909C4v") prs_y = get_fieldvalues(recID, "909C4y") prs_n = get_fieldvalues(recID, "909C4n") prs_c = get_fieldvalues(recID, "909C4c") for idx in range(0, len(prs_p)): out += """<p style="margin-left: 15%%; width: 70%%"> <small><strong>Publ. in:</strong> %s""" % prs_p[idx] if prs_v and prs_v[idx]: out += """<strong>%s</strong>""" % prs_v[idx] if prs_y and prs_y[idx]: out += """(%s)""" % prs_y[idx] if prs_n and prs_n[idx]: out += """, no.%s""" % prs_n[idx] if prs_c and prs_c[idx]: out += """, p.%s""" % prs_c[idx] out += """.</small></p>""" # sixthly, fulltext link: urls_z = get_fieldvalues(recID, "8564_z") urls_u = get_fieldvalues(recID, "8564_u") for idx in range(0, len(urls_u)): link_text = "URL" try: if urls_z[idx]: link_text = urls_z[idx] except IndexError: pass out += """<p style="margin-left: 15%%; width: 70%%"> <small><strong>%s:</strong> <a href="%s">%s</a></small></p>""" % (link_text, urls_u[idx], urls_u[idx]) # print some white space at the end: out += "<br /><br />" return out def tmpl_print_record_list_for_similarity_boxen(self, title, recID_score_list, ln=CFG_SITE_LANG): """Print list of records in the "hs" (HTML Similarity) format for similarity boxes. RECID_SCORE_LIST is a list of (recID1, score1), (recID2, score2), etc. """ from invenio.search_engine import print_record, record_public_p recID_score_list_to_be_printed = [] # firstly find 5 first public records to print: nb_records_to_be_printed = 0 nb_records_seen = 0 while nb_records_to_be_printed < 5 and nb_records_seen < len(recID_score_list) and nb_records_seen < 50: # looking through first 50 records only, picking first 5 public ones (recID, score) = recID_score_list[nb_records_seen] nb_records_seen += 1 if record_public_p(recID): nb_records_to_be_printed += 1 recID_score_list_to_be_printed.append([recID, score]) # secondly print them: out = ''' <table><tr> <td> <table><tr><td class="blocknote">%(title)s</td></tr></table> </td> </tr> <tr> <td><table> ''' % { 'title': cgi.escape(title) } for recid, score in recID_score_list_to_be_printed: out += ''' <tr><td><font class="rankscoreinfo"><a>(%(score)s) </a></font><small> %(info)s</small></td></tr>''' % { 'score': score, 'info' : print_record(recid, format="hs", ln=ln), } out += """</table></td></tr></table> """ return out def tmpl_print_record_brief(self, ln, recID): """Displays a brief record on-the-fly Parameters: - 'ln' *string* - The language to display - 'recID' *int* - The record id """ out = "" # record 'recID' does not exist in format 'format', so print some default format: # firstly, title: titles = get_fieldvalues(recID, "245__a") # secondly, authors: authors = get_fieldvalues(recID, "100__a") + get_fieldvalues(recID, "700__a") # thirdly, date of creation: dates = get_fieldvalues(recID, "260__c") # thirdly bis, report numbers: rns = get_fieldvalues(recID, "037__a") rns = get_fieldvalues(recID, "088__a") # fourthly, beginning of abstract: abstracts = get_fieldvalues(recID, "520__a") # fifthly, fulltext link: urls_z = get_fieldvalues(recID, "8564_z") urls_u = get_fieldvalues(recID, "8564_u") ## unAPI identifier out = '<abbr class="unapi-id" title="%s"></abbr>\n' % recID out += self.tmpl_record_body( titles = titles, authors = authors, dates = dates, rns = rns, abstracts = abstracts, urls_u = urls_u, urls_z = urls_z, ln=ln) return out def tmpl_print_record_brief_links(self, ln, recID): """Displays links for brief record on-the-fly Parameters: - 'ln' *string* - The language to display - 'recID' *int* - The record id """ # load the right message language _ = gettext_set_language(ln) out = '<div class="moreinfo">' if CFG_WEBSEARCH_USE_ALEPH_SYSNOS: alephsysnos = get_fieldvalues(recID, "970__a") if len(alephsysnos)>0: alephsysno = alephsysnos[0] out += '<span class="moreinfo">%s</span>' % \ create_html_link(self.build_search_url(recid=alephsysno, ln=ln), {}, _("Detailed record"), {'class': "moreinfo"}) else: out += '<span class="moreinfo">%s</span>' % \ create_html_link(self.build_search_url(recid=recID, ln=ln), {}, _("Detailed record"), {'class': "moreinfo"}) else: out += '<span class="moreinfo">%s</span>' % \ create_html_link(self.build_search_url(recid=recID, ln=ln), {}, _("Detailed record"), {'class': "moreinfo"}) out += '<span class="moreinfo"> - %s</span>' % \ create_html_link(self.build_search_url(p="recid:%d" % recID, rm="wrd", ln=ln), {}, _("Similar records"), {'class': "moreinfo"}) if CFG_BIBRANK_SHOW_CITATION_LINKS: num_timescited = get_cited_by_count(recID) if num_timescited: out += '<span class="moreinfo"> - %s</span>' % \ create_html_link(self.build_search_url(p="recid:%d" % recID, rm="citation", ln=ln), {}, _("Cited by %i records") % num_timescited, {'class': "moreinfo"}) else: out+="<!--not showing citations links-->" out+='</div>' return out def tmpl_xml_rss_prologue(self, current_url=None, previous_url=None, next_url=None): """Creates XML RSS 2.0 prologue.""" out = """<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss" xmlns:atom="http://www.w3.org/2005/Atom"> <channel> <title>%(sitename)s</title> <link>%(siteurl)s</link> <description>%(sitename)s latest documents</description> <language>%(sitelang)s</language> <pubDate>%(timestamp)s</pubDate> <category></category> <generator>CDS Invenio %(version)s</generator> <webMaster>%(sitesupportemail)s</webMaster> <ttl>%(timetolive)s</ttl>%(previous_link)s%(next_link)s%(current_link)s <image> <url>%(siteurl)s/img/cds.png</url> <title>%(sitename)s</title> <link>%(siteurl)s</link> </image> <textInput> <title>Search </title> <description>Search this site:</description> <name>p</name> <link>%(siteurl)s/search</link> </textInput> """ % {'sitename': CFG_SITE_NAME, 'siteurl': CFG_SITE_URL, 'sitelang': CFG_SITE_LANG, 'timestamp': time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime()), 'version': CFG_VERSION, 'sitesupportemail': CFG_SITE_SUPPORT_EMAIL, 'timetolive': CFG_WEBSEARCH_RSS_TTL, 'current_link': (current_url and \ '\n<atom:link rel="self" href="%s" />\n' % current_url) or '', 'previous_link': (previous_url and \ '\n<atom:link rel="previous" href="%s" />' % previous_url) or '', 'next_link': (next_url and \ '\n<atom:link rel="next" href="%s" />' % next_url) or '', } return out def tmpl_xml_rss_epilogue(self): """Creates XML RSS 2.0 epilogue.""" out = """\ </channel> </rss>\n""" return out def tmpl_xml_nlm_prologue(self): """Creates XML NLM prologue.""" out = """<articles>\n""" return out def tmpl_xml_nlm_epilogue(self): """Creates XML NLM epilogue.""" out = """\n</articles>""" return out def tmpl_xml_refworks_prologue(self): """Creates XML RefWorks prologue.""" out = """<references>\n""" return out def tmpl_xml_refworks_epilogue(self): """Creates XML RefWorks epilogue.""" out = """\n</references>""" return out def tmpl_xml_endnote_prologue(self): """Creates XML EndNote prologue.""" out = """<records>\n""" return out def tmpl_xml_endnote_epilogue(self): """Creates XML EndNote epilogue.""" out = """\n</records>""" return out def tmpl_xml_marc_prologue(self): """Creates XML MARC prologue.""" out = """<collection xmlns="http://www.loc.gov/MARC21/slim">\n""" return out def tmpl_xml_marc_epilogue(self): """Creates XML MARC epilogue.""" out = """\n</collection>""" return out def tmpl_xml_mods_prologue(self): """Creates XML MODS prologue.""" out = """<modsCollection xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n xsi:schemaLocation="http://www.loc.gov/mods/v3\n http://www.loc.gov/standards/mods/v3/mods-3-3.xsd">\n""" return out def tmpl_xml_mods_epilogue(self): """Creates XML MODS epilogue.""" out = """\n</modsCollection>""" return out def tmpl_xml_default_prologue(self): """Creates XML default format prologue. (Sanity calls only.)""" out = """<collection>\n""" return out def tmpl_xml_default_epilogue(self): """Creates XML default format epilogue. (Sanity calls only.)""" out = """\n</collection>""" return out def tmpl_collection_not_found_page_title(self, colname, ln=CFG_SITE_LANG): """ Create page title for cases when unexisting collection was asked for. """ _ = gettext_set_language(ln) out = _("Collection %s Not Found") % cgi.escape(colname) return out def tmpl_collection_not_found_page_body(self, colname, ln=CFG_SITE_LANG): """ Create page body for cases when unexisting collection was asked for. """ _ = gettext_set_language(ln) out = """<h1>%(title)s</h1> <p>%(sorry)s</p> <p>%(you_may_want)s</p> """ % { 'title': self.tmpl_collection_not_found_page_title(colname, ln), 'sorry': _("Sorry, collection %s does not seem to exist.") % \ ('<strong>' + cgi.escape(colname) + '</strong>'), 'you_may_want': _("You may want to start browsing from %s.") % \ ('<a href="' + CFG_SITE_URL + '?ln=' + ln + '">' + \ cgi.escape(CFG_SITE_NAME_INTL.get(ln, CFG_SITE_NAME)) + '</a>')} return out def tmpl_alert_rss_teaser_box_for_query(self, id_query, ln): """Propose teaser for setting up this query as alert or RSS feed. Parameters: - 'id_query' *int* - ID of the query we make teaser for - 'ln' *string* - The language to display """ # load the right message language _ = gettext_set_language(ln) # get query arguments: res = run_sql("SELECT urlargs FROM query WHERE id=%s", (id_query,)) argd = {} if res: argd = cgi.parse_qs(res[0][0]) rssurl = self.build_rss_url(argd) alerturl = CFG_SITE_URL + '/youralerts/input?ln=%s&idq=%s' % (ln, id_query) out = '''<a name="googlebox"></a> <table class="googlebox"><tr><th class="googleboxheader">%(similar)s</th></tr> <tr><td class="googleboxbody">%(msg_alert)s</td></tr> </table> ''' % { 'similar' : _("Interested in being notified about new results for this query?"), 'msg_alert': _("""Set up a personal %(x_url1_open)semail alert%(x_url1_close)s or subscribe to the %(x_url2_open)sRSS feed%(x_url2_close)s.""") % \ {'x_url1_open': '<a href="%s"><img src="%s/img/mail-icon-12x8.gif" border="0" alt="" /></a> ' % (alerturl, CFG_SITE_URL) + ' <a class="google" href="%s">' % (alerturl), 'x_url1_close': '</a>', 'x_url2_open': '<a href="%s"><img src="%s/img/feed-icon-12x12.gif" border="0" alt="" /></a> ' % (rssurl, CFG_SITE_URL) + ' <a class="google" href="%s">' % rssurl, 'x_url2_close': '</a>', }} return out def tmpl_detailed_record_metadata(self, recID, ln, format, content, creationdate=None, modificationdate=None): """Returns the main detailed page of a record Parameters: - 'recID' *int* - The ID of the printed record - 'ln' *string* - The language to display - 'format' *string* - The format in used to print the record - 'content' *string* - The main content of the page - 'creationdate' *string* - The creation date of the printed record - 'modificationdate' *string* - The last modification date of the printed record """ _ = gettext_set_language(ln) ## unAPI identifier out = '<abbr class="unapi-id" title="%s"></abbr>\n' % recID out += content return out def tmpl_detailed_record_statistics(self, recID, ln, downloadsimilarity, downloadhistory, viewsimilarity): """Returns the statistics page of a record Parameters: - 'recID' *int* - The ID of the printed record - 'ln' *string* - The language to display - downloadsimilarity *string* - downloadsimilarity box - downloadhistory *string* - downloadhistory box - viewsimilarity *string* - viewsimilarity box """ # load the right message language _ = gettext_set_language(ln) out = '' if CFG_BIBRANK_SHOW_DOWNLOAD_STATS and downloadsimilarity is not None: similar = self.tmpl_print_record_list_for_similarity_boxen ( _("People who downloaded this document also downloaded:"), downloadsimilarity, ln) out = '<table>' out += ''' <tr><td>%(graph)s</td></tr> <tr><td>%(similar)s</td></tr> ''' % { 'siteurl': CFG_SITE_URL, 'recid': recID, 'ln': ln, 'similar': similar, 'more': _("more"), 'graph': downloadsimilarity } out += '</table>' out += '<br />' if CFG_BIBRANK_SHOW_READING_STATS and viewsimilarity is not None: out += self.tmpl_print_record_list_for_similarity_boxen ( _("People who viewed this page also viewed:"), viewsimilarity, ln) if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS and downloadhistory is not None: out += downloadhistory + '<br />' return out def tmpl_detailed_record_citations_prologue(self, recID, ln): """Returns the prologue of the citations page of a record Parameters: - 'recID' *int* - The ID of the printed record - 'ln' *string* - The language to display """ return '<table>' def tmpl_detailed_record_citations_epilogue(self, recID, ln): """Returns the epilogue of the citations page of a record Parameters: - 'recID' *int* - The ID of the printed record - 'ln' *string* - The language to display """ return '</table>' def tmpl_detailed_record_citations_citing_list(self, recID, ln, citinglist): """Returns the list of record citing this one Parameters: - 'recID' *int* - The ID of the printed record - 'ln' *string* - The language to display - citinglist *list* - a list of tuples [(x1,y1),(x2,y2),..] where x is doc id and y is number of citations """ # load the right message language _ = gettext_set_language(ln) out = '' if CFG_BIBRANK_SHOW_CITATION_STATS and citinglist is not None: similar = self.tmpl_print_record_list_for_similarity_boxen( _("Cited by: %s records") % len (citinglist), citinglist, ln) out += ''' <tr><td> %(similar)s %(more)s <br /><br /> </td></tr>''' % { 'more': create_html_link( self.build_search_url(p='recid:%d' % \ recID, #XXXX rm='citation', ln=ln), {}, _("more")), 'similar': similar} return out def tmpl_detailed_record_citations_citation_history(self, recID, ln, citationhistory): """Returns the citations history graph of this record Parameters: - 'recID' *int* - The ID of the printed record - 'ln' *string* - The language to display - citationhistory *string* - citationhistory box """ # load the right message language _ = gettext_set_language(ln) out = '' if CFG_BIBRANK_SHOW_CITATION_GRAPHS and citationhistory is not None: out = '<!--citation history--><tr><td>%s</td></tr>' % citationhistory else: out = "<!--not showing citation history. CFG_BIBRANK_SHOW_CITATION_GRAPHS:" out+= str(CFG_BIBRANK_SHOW_CITATION_GRAPHS)+" citationhistory " if citationhistory: out+= str(len(citationhistory))+"-->" else: out+= "no citationhistory -->" return out def tmpl_detailed_record_citations_co_citing(self, recID, ln, cociting): """Returns the list of cocited records Parameters: - 'recID' *int* - The ID of the printed record - 'ln' *string* - The language to display - cociting *string* - cociting box """ # load the right message language _ = gettext_set_language(ln) out = '' if CFG_BIBRANK_SHOW_CITATION_STATS and cociting is not None: similar = self.tmpl_print_record_list_for_similarity_boxen ( _("Co-cited with: %s records") % len (cociting), cociting, ln) out = ''' <tr><td> %(similar)s %(more)s <br /> </td></tr>''' % { 'more': create_html_link(self.build_search_url(p='cocitedwith:%d' % recID, ln=ln), {}, _("more")), 'similar': similar } return out def tmpl_detailed_record_citations_self_cited(self, recID, ln, selfcited, citinglist): """Returns the list of self-citations for this record Parameters: - 'recID' *int* - The ID of the printed record - 'ln' *string* - The language to display - selfcited list - a list of self-citations for recID """ # load the right message language _ = gettext_set_language(ln) out = '' if CFG_BIBRANK_SHOW_CITATION_GRAPHS and selfcited is not None: sc_scorelist = [] #a score list for print.. for s in selfcited: #copy weight from citations weight = 0 for c in citinglist: (crec, score) = c if crec == s: weight = score tmp = [s, weight] sc_scorelist.append(tmp) scite = self.tmpl_print_record_list_for_similarity_boxen ( _(".. of which self-citations: %s records") % len (selfcited), sc_scorelist, ln) out = '<tr><td>'+scite+'</td></tr>' return out def tmpl_author_information(self, req, pubs, authorname, num_downloads, aff_pubdict, citedbylist, kwtuples, authors, vtuples, ln): """Prints stuff about the author given as authorname. 1. Author name + his/her institutes. Each institute I has a link to papers where the auhtor has I as institute. 2. Publications, number: link to search by author. 3. Keywords 4. Author collabs 5. Publication venues like journals The parameters are data structures needed to produce 1-6, as follows: req - request pubs - list of recids, probably the records that have the author as an author authorname - evident num_downloads - evident aff_pubdict - a dictionary where keys are inst names and values lists of recordids citedbylist - list of recs that cite pubs kwtuples - keyword tuples like ('HIGGS BOSON',[3,4]) where 3 and 4 are recids authors - a list of authors that have collaborated with authorname """ from invenio.search_engine import perform_request_search _ = gettext_set_language(ln) #make a authoraff string that looks like CERN (1), Caltech (2) etc authoraff = "" for a in aff_pubdict.keys(): recids = "+or+".join(map(str, aff_pubdict[a])) print_a = a if (print_a == ' '): print_a = _("unknown") if authoraff: authoraff += '<br>' authoraff += "<a href=\"../search?f=recid&p="+recids+"\">"+print_a+' ('+str(len(aff_pubdict[a]))+")</a>" #print a "general" banner about the author req.write("<h1>" + authorname + "</h1>") #print affiliations line1 = "<strong>" + _("Affiliations:") + "</strong>" line2 = authoraff req.write(self.tmpl_print_searchresultbox(line1, line2)) # print frequent keywords: keywstr = "" if (kwtuples): for (kw, freq) in kwtuples: if keywstr: keywstr += '<br>' #create a link in author=x, keyword=y searchstr = create_html_link(self.build_search_url( p='author:"' + authorname + '" ' + 'keyword:"' + kw + '"'), {}, kw+" ("+str(freq)+")",) keywstr = keywstr+" "+searchstr banner = self.tmpl_print_searchresultbox("<strong>" + _("Frequent keywords:") + "</strong>", keywstr) req.write(banner) # print frequent co-authors: collabstr = "" if (authors): for c in authors: c = c.strip() if collabstr: collabstr += '<br>' #do not add this person him/herself in the list if not c == authorname: commpubs = intbitset(pubs) & intbitset(perform_request_search(p="author:\"%s\" author:\"%s\"" % (authorname, c))) collabstr = collabstr + " <a href=\"/author/"+c+"\">"+c+" ("+str(len(commpubs))+")</a>" banner = self.tmpl_print_searchresultbox("<strong>" + _("Frequent co-authors:") + "</strong>", collabstr) req.write(banner) # print frequently publishes in journals: if (vtuples): pubinfo = "" for t in vtuples: (journal, num) = t pubinfo += create_html_link(self.build_search_url(p='author:"' + authorname + '" ' + \ 'journal:"' + journal + '"'), {}, journal + " ("+str(num)+")<br/>") banner = self.tmpl_print_searchresultbox("<strong>" + _("Frequently publishes in:") + "<strong>", pubinfo) req.write(banner) # print papers: searchstr = create_html_link(self.build_search_url(p=authorname, f='author'), {}, "All papers ("+str(len(pubs))+")",) line1 = "<strong>" + _("Papers:") + "</strong>" line2 = searchstr if num_downloads: line2 + " ("+_("downloaded")+" " line2 += str(num_downloads)+" "+_("times")+")" from invenio.search_engine import perform_request_search if CFG_INSPIRE_SITE: CFG_COLLS = ['Book', 'Conference', 'Introductory', 'Lectures', 'Preprint', 'Published', 'Report', 'Review', 'Thesis'] else: CFG_COLLS = ['Article', 'Book', 'Preprint',] for coll in CFG_COLLS: collsearch = intbitset(pubs) & intbitset(perform_request_search(p="collection:"+coll)) if len(collsearch) > 0: num = len(collsearch) line2 += "<br>" + create_html_link(self.build_search_url(p='author:"' + authorname + '" ' + \ 'collection:' + coll), {}, coll + " ("+str(num)+")",) banner = self.tmpl_print_searchresultbox(line1, line2) req.write(banner) # print citations: if len(citedbylist): line1 = "<strong>" + _("Citations:") + "</strong>" line2 = "" req.write(self.tmpl_print_searchresultbox(line1, line2)) # they will be printed after that def tmpl_detailed_record_references(self, recID, ln, content): """Returns the discussion page of a record Parameters: - 'recID' *int* - The ID of the printed record - 'ln' *string* - The language to display - 'content' *string* - The main content of the page """ # load the right message language _ = gettext_set_language(ln) out = '' if content is not None: out += content return out def tmpl_citesummary_prologue(self, d_total_recs, l_colls, searchpattern, searchfield, ln=CFG_SITE_LANG): """HTML citesummary format, prologue. A part of HCS format suite.""" _ = gettext_set_language(ln) out = """<p><table id="citesummary"> <tr><td><strong class="headline">%(msg_title)s</strong></td>""" % \ {'msg_title': _("Citation summary results"),} for coll, colldef in l_colls: out += '<td align="right">%s</td>' % coll out += '</tr>' out += """<tr><td><strong>%(msg_recs)s</strong></td>""" % \ {'msg_recs': _("Total number of citable papers analyzed:"),} for coll, colldef in l_colls: link_url = CFG_SITE_URL + '/search?p=' if searchpattern: p = searchpattern if searchfield: if " " in searchpattern: p = searchfield + ':"' + searchpattern + '"' else: p = searchfield + ':' + searchpattern link_url += quote(p) if colldef: link_url += ' ' + quote(colldef) link_url += '&rm=citation'; link_text = self.tmpl_nice_number(d_total_recs[coll], ln) out += '<td align="right"><a href="%s">%s</a></td>' % (link_url, link_text) out += '</tr>' return out def tmpl_citesummary_overview(self, d_total_cites, d_avg_cites, l_colls, ln=CFG_SITE_LANG): """HTML citesummary format, overview. A part of HCS format suite.""" _ = gettext_set_language(ln) out = """<tr><td><strong>%(msg_cites)s</strong></td>""" % \ {'msg_cites': _("Total number of citations:"),} for coll, colldef in l_colls: out += '<td align="right">%s</td>' % self.tmpl_nice_number(d_total_cites[coll], ln) out += '</tr>' out += """<tr><td><strong>%(msg_avgcit)s</strong></td>""" % \ {'msg_avgcit': _("Average citations per paper:"),} for coll, colldef in l_colls: out += '<td align="right">%.1f</td>' % d_avg_cites[coll] out += '</tr>' out += """<tr><td><strong>%(msg_breakdown)s</strong></td></tr>""" % \ {'msg_breakdown': _("Breakdown of papers by citations:"),} return out def tmpl_citesummary_breakdown_by_fame(self, d_cites, low, high, fame, l_colls, searchpattern, searchfield, ln=CFG_SITE_LANG): """HTML citesummary format, breakdown by fame. A part of HCS format suite.""" _ = gettext_set_language(ln) out = """<tr><td>%(fame)s</td>""" % \ {'fame': fame,} for coll, colldef in l_colls: link_url = CFG_SITE_URL + '/search?p=' if searchpattern: p = searchpattern if searchfield: if " " in searchpattern: p = searchfield + ':"' + searchpattern + '"' else: p = searchfield + ':' + searchpattern link_url += quote(p) + ' ' if colldef: link_url += quote(colldef) + ' ' if low == 0 and high == 0: link_url += quote('cited:0') else: link_url += quote('cited:%i->%i' % (low, high)) link_url += '&rm=citation'; link_text = self.tmpl_nice_number(d_cites[coll], ln) out += '<td align="right"><a href="%s">%s</a></td>' % (link_url, link_text) out += '</tr>' return out def tmpl_citesummary_epilogue(self, ln=CFG_SITE_LANG): """HTML citesummary format, epilogue. A part of HCS format suite.""" _ = gettext_set_language(ln) out = """</table>""" return out def tmpl_unapi(self, formats, identifier=None): """ Provide a list of object format available from the unAPI service for the object identified by IDENTIFIER """ out = '<?xml version="1.0" encoding="UTF-8" ?>\n' if identifier: out += '<formats id="%i">\n' % (identifier) else: out += "<formats>\n" for format_name, format_type in formats.iteritems(): docs = '' if format_name == 'xn': docs = 'http://www.nlm.nih.gov/databases/dtd/' format_type = 'application/xml' format_name = 'nlm' elif format_name == 'xm': docs = 'http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd' format_type = 'application/xml' format_name = 'marcxml' elif format_name == 'xr': format_type = 'application/rss+xml' docs = 'http://www.rssboard.org/rss-2-0/' elif format_name == 'xw': format_type = 'application/xml' docs = 'http://www.refworks.com/RefWorks/help/RefWorks_Tagged_Format.htm' elif format_name == 'xoaidc': format_type = 'application/xml' docs = 'http://www.openarchives.org/OAI/2.0/oai_dc.xsd' elif format_name == 'xe': format_type = 'application/xml' docs= 'http://www.endnote.com/support/' format_name = 'endnote' elif format_name == 'xd': format_type = 'application/xml' docs = 'http://dublincore.org/schemas/' format_name = 'dc' elif format_name == 'xo': format_type = 'application/xml' docs = 'http://www.loc.gov/standards/mods/v3/mods-3-3.xsd' format_name = 'mods' if docs: out += '<format name="%s" type="%s" docs="%s" />\n' % (xml_escape(format_name), xml_escape(format_type), xml_escape(docs)) else: out += '<format name="%s" type="%s" />\n' % (xml_escape(format_name), xml_escape(format_type)) out += "</formats>" return out diff --git a/modules/websearch/lib/websearch_webcoll.py b/modules/websearch/lib/websearch_webcoll.py index 51a3e4830..9697917d4 100644 --- a/modules/websearch/lib/websearch_webcoll.py +++ b/modules/websearch/lib/websearch_webcoll.py @@ -1,935 +1,935 @@ ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Create CDS Invenio collection cache.""" __revision__ = "$Id$" import calendar import copy import sys import cgi import re import os import string import time from invenio.config import \ CFG_CERN_SITE, \ CFG_WEBSEARCH_INSTANT_BROWSE, \ CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, \ CFG_WEBSEARCH_I18N_LATEST_ADDITIONS, \ CFG_CACHEDIR, \ CFG_SITE_LANG, \ CFG_SITE_NAME, \ CFG_SITE_URL, \ CFG_SITE_LANGS, \ CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES, \ CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE from invenio.messages import gettext_set_language, language_list_long from invenio.search_engine import HitSet, search_pattern, get_creation_date, get_field_i18nname, collection_restricted_p from invenio.dbquery import run_sql, Error, get_table_update_time from invenio.access_control_engine import acc_authorize_action from invenio.bibrank_record_sorter import get_bibrank_methods from invenio.dateutils import convert_datestruct_to_dategui from invenio.bibformat import format_record from invenio.websearch_external_collections import \ external_collection_load_states, \ dico_collection_external_searches, \ external_collection_sort_engine_by_name from invenio.bibtask import task_init, task_get_option, task_set_option, \ write_message, task_has_option, task_update_progress, \ task_sleep_now_if_required import invenio.template websearch_templates = invenio.template.load('websearch') ## global vars collection_house = {} # will hold collections we treat in this run of the program; a dict of {collname2, collobject1}, ... # cfg_cache_last_updated_timestamp_tolerance -- cache timestamp # tolerance (in seconds), to account for the fact that an admin might # accidentally happen to edit the collection definitions at exactly # the same second when some webcoll process was about to be started. # In order to be safe, let's put an exaggerated timestamp tolerance # value such as 20 seconds: cfg_cache_last_updated_timestamp_tolerance = 20 # cfg_cache_last_updated_timestamp_file -- location of the cache # timestamp file: cfg_cache_last_updated_timestamp_file = "%s/collections/last_updated" % CFG_CACHEDIR def get_collection(colname): """Return collection object from the collection house for given colname. If does not exist, then create it.""" if not collection_house.has_key(colname): colobject = Collection(colname) collection_house[colname] = colobject return collection_house[colname] ## auxiliary functions: def mymkdir(newdir, mode=0777): """works the way a good mkdir should :) - already exists, silently complete - regular file in the way, raise an exception - parent directory(ies) does not exist, make them as well """ if os.path.isdir(newdir): pass elif os.path.isfile(newdir): raise OSError("a file with the same name as the desired " \ "dir, '%s', already exists." % newdir) else: head, tail = os.path.split(newdir) if head and not os.path.isdir(head): mymkdir(head, mode) if tail: os.umask(022) os.mkdir(newdir, mode) def is_selected(var, fld): "Checks if the two are equal, and if yes, returns ' selected'. Useful for select boxes." if var == fld: return ' selected="selected"' else: return "" def get_field(recID, tag): "Gets list of field 'tag' for the record with 'recID' system number." out = [] digit = tag[0:2] bx = "bib%sx" % digit bibx = "bibrec_bib%sx" % digit query = "SELECT bx.value FROM %s AS bx, %s AS bibx WHERE bibx.id_bibrec='%s' AND bx.id=bibx.id_bibxxx AND bx.tag='%s'" \ % (bx, bibx, recID, tag) res = run_sql(query) for row in res: out.append(row[0]) return out class Collection: "Holds the information on collections (id,name,dbquery)." def __init__(self, name=""): "Creates collection instance by querying the DB configuration database about 'name'." self.calculate_reclist_run_already = 0 # to speed things up without much refactoring self.update_reclist_run_already = 0 # to speed things up without much refactoring self.reclist_with_nonpublic_subcolls = HitSet() if not name: self.name = CFG_SITE_NAME # by default we are working on the home page self.id = 1 self.dbquery = None self.nbrecs = None self.reclist = HitSet() else: self.name = name try: res = run_sql("""SELECT id,name,dbquery,nbrecs,reclist FROM collection WHERE name=%s""", (name,)) if res: self.id = res[0][0] self.name = res[0][1] self.dbquery = res[0][2] self.nbrecs = res[0][3] try: self.reclist = HitSet(res[0][4]) except: self.reclist = HitSet() else: # collection does not exist! self.id = None self.dbquery = None self.nbrecs = None self.reclist = HitSet() except Error, e: print "Error %d: %s" % (e.args[0], e.args[1]) sys.exit(1) def get_example_search_queries(self): """Returns list of sample search queries for this collection. """ res = run_sql("""SELECT example.body FROM example LEFT JOIN collection_example on example.id=collection_example.id_example WHERE collection_example.id_collection=%s ORDER BY collection_example.score""", (self.id,)) return [query[0] for query in res] def get_name(self, ln=CFG_SITE_LANG, name_type="ln", prolog="", epilog="", prolog_suffix=" ", epilog_suffix=""): """Return nicely formatted collection name for language LN. The NAME_TYPE may be 'ln' (=long name), 'sn' (=short name), etc.""" out = prolog i18name = "" res = run_sql("SELECT value FROM collectionname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, ln, name_type)) try: i18name += res[0][0] except IndexError: pass if i18name: out += i18name else: out += self.name out += epilog return out def get_ancestors(self): "Returns list of ancestors of the current collection." ancestors = [] id_son = self.id while 1: query = "SELECT cc.id_dad,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_son=%d AND c.id=cc.id_dad" % int(id_son) res = run_sql(query, None, 1) if res: col_ancestor = get_collection(res[0][1]) ancestors.append(col_ancestor) id_son = res[0][0] else: break ancestors.reverse() return ancestors def restricted_p(self): """Predicate to test if the collection is restricted or not. Return the contect of the `restrited' column of the collection table (typically Apache group). Otherwise return None if the collection is public.""" if collection_restricted_p(self.name): return 1 return None def get_sons(self, type='r'): "Returns list of direct sons of type 'type' for the current collection." sons = [] id_dad = self.id query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC, c.name ASC" % (int(id_dad), type) res = run_sql(query) for row in res: sons.append(get_collection(row[1])) return sons def get_descendants(self, type='r'): "Returns list of all descendants of type 'type' for the current collection." descendants = [] id_dad = self.id query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC" % (int(id_dad), type) res = run_sql(query) for row in res: col_desc = get_collection(row[1]) descendants.append(col_desc) descendants += col_desc.get_descendants() return descendants def write_cache_file(self, filename='', filebody=''): "Write a file inside collection cache." # open file: dirname = "%s/collections/%d" % (CFG_CACHEDIR, self.id) mymkdir(dirname) fullfilename = dirname + "/%s.html" % filename try: os.umask(022) f = open(fullfilename, "w") except IOError, v: try: (code, message) = v except: code = 0 message = v print "I/O Error: " + str(message) + " (" + str(code) + ")" sys.exit(1) # print user info: write_message("... creating %s" % fullfilename, verbose=6) sys.stdout.flush() # print page body: f.write(filebody) # close file: f.close() def update_webpage_cache(self): """Create collection page header, navtrail, body (including left and right stripes) and footer, and call write_cache_file() afterwards to update the collection webpage cache.""" ## precalculate latest additions for non-aggregate ## collections (the info is ln and as independent) if self.dbquery and not CFG_WEBSEARCH_I18N_LATEST_ADDITIONS: self.create_latest_additions_info() ## do this for each language: for lang, lang_fullname in language_list_long(): # but only if some concrete language was not chosen only: if lang in task_get_option("language", [lang]): if self.dbquery and CFG_WEBSEARCH_I18N_LATEST_ADDITIONS: self.create_latest_additions_info(ln=lang) # load the right message language _ = gettext_set_language(lang) ## first, update navtrail: - for as in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES: - self.write_cache_file("navtrail-as=%s-ln=%s" % (as, lang), - self.create_navtrail_links(as, lang)) + for aas in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES: + self.write_cache_file("navtrail-as=%s-ln=%s" % (aas, lang), + self.create_navtrail_links(aas, lang)) ## second, update page body: - for as in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES: # do light, simple and advanced search pages: + for aas in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES: # do light, simple and advanced search pages: body = websearch_templates.tmpl_webcoll_body( ln=lang, collection=self.name, te_portalbox = self.create_portalbox(lang, 'te'), - searchfor = self.create_searchfor(as, lang), + searchfor = self.create_searchfor(aas, lang), np_portalbox = self.create_portalbox(lang, 'np'), - narrowsearch = self.create_narrowsearch(as, lang, 'r'), - focuson = self.create_narrowsearch(as, lang, "v") + \ + narrowsearch = self.create_narrowsearch(aas, lang, 'r'), + focuson = self.create_narrowsearch(aas, lang, "v") + \ self.create_external_collections_box(lang), - instantbrowse = self.create_instant_browse(as=as, ln=lang), + instantbrowse = self.create_instant_browse(aas=aas, ln=lang), ne_portalbox = self.create_portalbox(lang, 'ne') ) - self.write_cache_file("body-as=%s-ln=%s" % (as, lang), body) + self.write_cache_file("body-as=%s-ln=%s" % (aas, lang), body) ## third, write portalboxes: self.write_cache_file("portalbox-tp-ln=%s" % lang, self.create_portalbox(lang, "tp")) self.write_cache_file("portalbox-te-ln=%s" % lang, self.create_portalbox(lang, "te")) self.write_cache_file("portalbox-lt-ln=%s" % lang, self.create_portalbox(lang, "lt")) self.write_cache_file("portalbox-rt-ln=%s" % lang, self.create_portalbox(lang, "rt")) ## fourth, write 'last updated' information: self.write_cache_file("last-updated-ln=%s" % lang, convert_datestruct_to_dategui(time.localtime(), ln=lang)) return - def create_navtrail_links(self, as=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): + def create_navtrail_links(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): """Creates navigation trail links, i.e. links to collection - ancestors (except Home collection). If as==1, then links to + ancestors (except Home collection). If aas==1, then links to Advanced Search interfaces; otherwise Simple Search. """ dads = [] for dad in self.get_ancestors(): if dad.name != CFG_SITE_NAME: # exclude Home collection dads.append((dad.name, dad.get_name(ln))) return websearch_templates.tmpl_navtrail_links( - as=as, ln=ln, dads=dads) + aas=aas, ln=ln, dads=dads) def create_portalbox(self, lang=CFG_SITE_LANG, position="rt"): """Creates portalboxes of language CFG_SITE_LANG of the position POSITION by consulting DB configuration database. The position may be: 'lt'='left top', 'rt'='right top', etc.""" out = "" query = "SELECT p.title,p.body FROM portalbox AS p, collection_portalbox AS cp "\ " WHERE cp.id_collection=%d AND p.id=cp.id_portalbox AND cp.ln='%s' AND cp.position='%s' "\ " ORDER BY cp.score DESC" % (self.id, lang, position) res = run_sql(query) for row in res: title, body = row[0], row[1] if title: out += websearch_templates.tmpl_portalbox(title = title, body = body) else: # no title specified, so print body ``as is'' only: out += body return out - def create_narrowsearch(self, as=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG, type="r"): + def create_narrowsearch(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG, type="r"): """Creates list of collection descendants of type 'type' under title 'title'. - If as==1, then links to Advanced Search interfaces; otherwise Simple Search. + If aas==1, then links to Advanced Search interfaces; otherwise Simple Search. Suitable for 'Narrow search' and 'Focus on' boxes.""" # get list of sons and analyse it sons = self.get_sons(type) if not sons: return '' # get descendents descendants = self.get_descendants(type) grandsons = [] if CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS: # load grandsons for each son for son in sons: grandsons.append(son.get_sons()) # return "" return websearch_templates.tmpl_narrowsearch( - as = as, + aas = aas, ln = ln, type = type, father = self, has_grandchildren = len(descendants)>len(sons), sons = sons, display_grandsons = CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, grandsons = grandsons ) def create_external_collections_box(self, ln=CFG_SITE_LANG): external_collection_load_states() if not dico_collection_external_searches.has_key(self.id): return "" engines_list = external_collection_sort_engine_by_name(dico_collection_external_searches[self.id]) return websearch_templates.tmpl_searchalso(ln, engines_list, self.id) def create_latest_additions_info(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, ln=CFG_SITE_LANG): """ Create info about latest additions that will be used for create_instant_browse() later. """ self.latest_additions_info = [] if self.nbrecs and self.reclist: # firstly, get last 'rg' records: recIDs = list(self.reclist) # FIXME: temporary hack in order to display tweaked latest # additions box for some CERN collections: if CFG_CERN_SITE: this_year = time.strftime("%Y", time.localtime()) if self.name in ['CERN Yellow Reports']: last_year = str(int(this_year) - 1) # detect recIDs only from this and past year: recIDs = list(self.reclist & \ search_pattern(p='year:%s or year:%s' % \ (this_year, last_year))) elif self.name in ['Videos']: # detect recIDs only from this year: recIDs = list(self.reclist & \ search_pattern(p='year:%s' % this_year)) total = len(recIDs) to_display = min(rg, total) for idx in range(total-1, total-to_display-1, -1): recid = recIDs[idx] self.latest_additions_info.append({'id': recid, 'format': format_record(recid, "hb", ln=ln), 'date': get_creation_date(recid, fmt="%Y-%m-%d<br />%H:%i")}) return - def create_instant_browse(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, as=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): + def create_instant_browse(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): "Searches database and produces list of last 'rg' records." if self.restricted_p(): return websearch_templates.tmpl_box_restricted_content(ln = ln) if rg == 0: # do not show latest additions box return "" # FIXME: temporary hack in order not to display latest # additions box for some CERN collections: if CFG_CERN_SITE and self.name in ['Periodicals', 'Electronic Journals']: return "" try: self.latest_additions_info latest_additions_info_p = True except: latest_additions_info_p = False if latest_additions_info_p: passIDs = [] for idx in range(0, min(len(self.latest_additions_info), rg)): passIDs.append({'id': self.latest_additions_info[idx]['id'], 'body': self.latest_additions_info[idx]['format'] + \ websearch_templates.tmpl_record_links(recid=self.latest_additions_info[idx]['id'], ln=ln), 'date': self.latest_additions_info[idx]['date']}) if self.nbrecs > rg: url = websearch_templates.build_search_url( - cc=self.name, jrec=rg+1, ln=ln, as=as) + cc=self.name, jrec=rg+1, ln=ln, aas=aas) else: url = "" return websearch_templates.tmpl_instant_browse( - as=as, ln=ln, recids=passIDs, more_link=url) + aas=aas, ln=ln, recids=passIDs, more_link=url) return websearch_templates.tmpl_box_no_records(ln=ln) def create_searchoptions(self): "Produces 'Search options' portal box." box = "" query = """SELECT DISTINCT(cff.id_field),f.code,f.name FROM collection_field_fieldvalue AS cff, field AS f WHERE cff.id_collection=%d AND cff.id_fieldvalue IS NOT NULL AND cff.id_field=f.id ORDER BY cff.score DESC""" % self.id res = run_sql(query) if res: for row in res: field_id = row[0] field_code = row[1] field_name = row[2] query_bis = """SELECT fv.value,fv.name FROM fieldvalue AS fv, collection_field_fieldvalue AS cff WHERE cff.id_collection=%d AND cff.type='seo' AND cff.id_field=%d AND fv.id=cff.id_fieldvalue ORDER BY cff.score_fieldvalue DESC, cff.score DESC, fv.name ASC""" % (self.id, field_id) res_bis = run_sql(query_bis) if res_bis: values = [{'value' : '', 'text' : 'any' + ' ' + field_name}] # FIXME: internationalisation of "any" for row_bis in res_bis: values.append({'value' : cgi.escape(row_bis[0], 1), 'text' : row_bis[1]}) box += websearch_templates.tmpl_select( fieldname = field_code, values = values ) return box def create_sortoptions(self, ln=CFG_SITE_LANG): """Produces 'Sort options' portal box.""" # load the right message language _ = gettext_set_language(ln) box = "" query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE id_collection=%d AND cff.type='soo' AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""" % self.id values = [{'value' : '', 'text': "- %s -" % _("latest first")}] res = run_sql(query) if res: for row in res: values.append({'value' : row[0], 'text': row[1]}) else: for tmp in ('title', 'author', 'report number', 'year'): values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) box = websearch_templates.tmpl_select( fieldname = 'sf', css_class = 'address', values = values ) box += websearch_templates.tmpl_select( fieldname = 'so', css_class = 'address', values = [ {'value' : 'a' , 'text' : _("asc.")}, {'value' : 'd' , 'text' : _("desc.")} ] ) return box def create_rankoptions(self, ln=CFG_SITE_LANG): "Produces 'Rank options' portal box." # load the right message language _ = gettext_set_language(ln) values = [{'value' : '', 'text': "- %s %s -" % (string.lower(_("OR")), _("rank by"))}] for (code, name) in get_bibrank_methods(self.id, ln): values.append({'value' : code, 'text': name}) box = websearch_templates.tmpl_select( fieldname = 'rm', css_class = 'address', values = values ) return box def create_displayoptions(self, ln=CFG_SITE_LANG): "Produces 'Display options' portal box." # load the right message language _ = gettext_set_language(ln) values = [] for i in ['10', '25', '50', '100', '250', '500']: values.append({'value' : i, 'text' : i + ' ' + _("results")}) box = websearch_templates.tmpl_select( fieldname = 'rg', css_class = 'address', values = values ) if self.get_sons(): box += websearch_templates.tmpl_select( fieldname = 'sc', css_class = 'address', values = [ {'value' : '1' , 'text' : _("split by collection")}, {'value' : '0' , 'text' : _("single list")} ] ) return box def create_formatoptions(self, ln=CFG_SITE_LANG): "Produces 'Output format options' portal box." # load the right message language _ = gettext_set_language(ln) box = "" values = [] query = """SELECT f.code,f.name FROM format AS f, collection_format AS cf WHERE cf.id_collection=%d AND cf.id_format=f.id AND f.visibility='1' ORDER BY cf.score DESC, f.name ASC""" % self.id res = run_sql(query) if res: for row in res: values.append({'value' : row[0], 'text': row[1]}) else: values.append({'value' : 'hb', 'text' : "HTML %s" % _("brief")}) box = websearch_templates.tmpl_select( fieldname = 'of', css_class = 'address', values = values ) return box def create_searchwithin_selection_box(self, fieldname='f', value='', ln='en'): """Produces 'search within' selection box for the current collection.""" # get values query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='sew' AND cff.id_collection=%d AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""" % self.id res = run_sql(query) values = [{'value' : '', 'text' : get_field_i18nname("any field", ln)}] if res: for row in res: values.append({'value' : row[0], 'text' : get_field_i18nname(row[1], ln)}) else: if CFG_CERN_SITE: for tmp in ['title', 'author', 'abstract', 'report number', 'year']: values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) else: for tmp in ['title', 'author', 'abstract', 'keyword', 'report number', 'journal', 'year', 'fulltext', 'reference']: values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) return websearch_templates.tmpl_searchwithin_select( fieldname = fieldname, ln = ln, selected = value, values = values ) def create_searchexample(self): "Produces search example(s) for the current collection." out = "$collSearchExamples = getSearchExample(%d, $se);" % self.id return out - def create_searchfor(self, as=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): + def create_searchfor(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): "Produces either Simple or Advanced 'Search for' box for the current collection." - if as == 1: + if aas == 1: return self.create_searchfor_advanced(ln) - elif as == 0: + elif aas == 0: return self.create_searchfor_simple(ln) else: return self.create_searchfor_light(ln) def create_searchfor_light(self, ln=CFG_SITE_LANG): "Produces light 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_light( ln=ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, example_search_queries=self.get_example_search_queries(), ) def create_searchfor_simple(self, ln=CFG_SITE_LANG): "Produces simple 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_simple( ln=ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, middle_option = self.create_searchwithin_selection_box(ln=ln), ) def create_searchfor_advanced(self, ln=CFG_SITE_LANG): "Produces advanced 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_advanced( ln = ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, middle_option_1 = self.create_searchwithin_selection_box('f1', ln=ln), middle_option_2 = self.create_searchwithin_selection_box('f2', ln=ln), middle_option_3 = self.create_searchwithin_selection_box('f3', ln=ln), searchoptions = self.create_searchoptions(), sortoptions = self.create_sortoptions(ln), rankoptions = self.create_rankoptions(ln), displayoptions = self.create_displayoptions(ln), formatoptions = self.create_formatoptions(ln) ) def calculate_reclist(self): """Calculate, set and return the (reclist, reclist_with_nonpublic_subcolls) tuple for given collection.""" if self.calculate_reclist_run_already: # do we have to recalculate? return (self.reclist, self.reclist_with_nonpublic_subcolls) write_message("... calculating reclist of %s" % self.name, verbose=6) reclist = HitSet() # will hold results for public sons only; good for storing into DB reclist_with_nonpublic_subcolls = HitSet() # will hold results for both public and nonpublic sons; good for deducing total # number of documents if not self.dbquery: # A - collection does not have dbquery, so query recursively all its sons # that are either non-restricted or that have the same restriction rules for coll in self.get_sons(): coll_reclist, coll_reclist_with_nonpublic_subcolls = coll.calculate_reclist() if ((coll.restricted_p() is None) or (coll.restricted_p() == self.restricted_p())): # add this reclist ``for real'' only if it is public reclist.union_update(coll_reclist) reclist_with_nonpublic_subcolls.union_update(coll_reclist_with_nonpublic_subcolls) else: # B - collection does have dbquery, so compute it: # (note: explicitly remove DELETED records) if CFG_CERN_SITE: reclist = search_pattern(None, self.dbquery + \ ' -collection:"DELETED" -collection:"DUMMY"') else: reclist = search_pattern(None, self.dbquery + ' -collection:"DELETED"') reclist_with_nonpublic_subcolls = copy.deepcopy(reclist) # store the results: self.nbrecs = len(reclist_with_nonpublic_subcolls) self.reclist = reclist self.reclist_with_nonpublic_subcolls = reclist_with_nonpublic_subcolls # last but not least, update the speed-up flag: self.calculate_reclist_run_already = 1 # return the two sets: return (self.reclist, self.reclist_with_nonpublic_subcolls) def update_reclist(self): "Update the record universe for given collection; nbrecs, reclist of the collection table." if self.update_reclist_run_already: # do we have to reupdate? return 0 write_message("... updating reclist of %s (%s recs)" % (self.name, self.nbrecs), verbose=6) sys.stdout.flush() try: run_sql("UPDATE collection SET nbrecs=%s, reclist=%s WHERE id=%s", (self.nbrecs, self.reclist.fastdump(), self.id)) self.reclist_updated_since_start = 1 except Error, e: print "Database Query Error %d: %s." % (e.args[0], e.args[1]) sys.exit(1) # last but not least, update the speed-up flag: self.update_reclist_run_already = 1 return 0 def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"): """Returns a date string according to the format string. It can handle normal date strings and shifts with respect to now.""" date = time.time() shift_re = re.compile("([-\+]{0,1})([\d]+)([dhms])") factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = shift_re.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) date = time.localtime(date + sign * factor * value) date = time.strftime(format_string, date) else: date = time.strptime(var, format_string) date = time.strftime(format_string, date) return date def get_current_time_timestamp(): """Return timestamp corresponding to the current time.""" return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) def compare_timestamps_with_tolerance(timestamp1, timestamp2, tolerance=0): """Compare two timestamps TIMESTAMP1 and TIMESTAMP2, of the form '2005-03-31 17:37:26'. Optionally receives a TOLERANCE argument (in seconds). Return -1 if TIMESTAMP1 is less than TIMESTAMP2 minus TOLERANCE, 0 if they are equal within TOLERANCE limit, and 1 if TIMESTAMP1 is greater than TIMESTAMP2 plus TOLERANCE. """ # remove any trailing .00 in timestamps: timestamp1 = re.sub(r'\.[0-9]+$', '', timestamp1) timestamp2 = re.sub(r'\.[0-9]+$', '', timestamp2) # first convert timestamps to Unix epoch seconds: timestamp1_seconds = calendar.timegm(time.strptime(timestamp1, "%Y-%m-%d %H:%M:%S")) timestamp2_seconds = calendar.timegm(time.strptime(timestamp2, "%Y-%m-%d %H:%M:%S")) # now compare them: if timestamp1_seconds < timestamp2_seconds - tolerance: return -1 elif timestamp1_seconds > timestamp2_seconds + tolerance: return 1 else: return 0 def get_database_last_updated_timestamp(): """Return last updated timestamp for collection-related and record-related database tables. """ database_tables_timestamps = [] database_tables_timestamps.append(get_table_update_time('bibrec')) database_tables_timestamps.append(get_table_update_time('bibfmt')) database_tables_timestamps.append(get_table_update_time('idxWORD%')) database_tables_timestamps.append(get_table_update_time('collection%')) database_tables_timestamps.append(get_table_update_time('portalbox')) database_tables_timestamps.append(get_table_update_time('field%')) database_tables_timestamps.append(get_table_update_time('format%')) database_tables_timestamps.append(get_table_update_time('rnkMETHODNAME')) return max(database_tables_timestamps) def get_cache_last_updated_timestamp(): """Return last updated cache timestamp.""" try: f = open(cfg_cache_last_updated_timestamp_file, "r") except: return "1970-01-01 00:00:00" timestamp = f.read() f.close() return timestamp def set_cache_last_updated_timestamp(timestamp): """Set last updated cache timestamp to TIMESTAMP.""" try: f = open(cfg_cache_last_updated_timestamp_file, "w") except: pass f.write(timestamp) f.close() return timestamp def main(): """Main that construct all the bibtask.""" task_init(authorization_action="runwebcoll", authorization_msg="WebColl Task Submission", description="""Description: webcoll updates the collection cache (record universe for a given collection plus web page elements) based on invenio.conf and DB configuration parameters. If the collection name is passed as the second argument, it'll update this collection only. If the collection name is immediately followed by a plus sign, it will also update all its desdendants. The top-level collection name may be entered as the void string.\n""", help_specific_usage=" -c, --collection\t Update cache for the given " "collection only. [all]\n" " -f, --force\t\t Force update even if cache is up to date. [no]\n" " -p, --part\t\t Update only certain cache parts (1=reclist," " 2=webpage). [both]\n" " -l, --language\t Update pages in only certain language" " (e.g. fr,it,...). [all]\n", version=__revision__, specific_params=("c:fp:l:", [ "collection=", "force", "part=", "language=" ]), task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter, task_submit_check_options_fnc=task_submit_check_options, task_run_fnc=task_run_core) def task_submit_elaborate_specific_parameter(key, value, opts, args): """ Given the string key it checks it's meaning, eventually using the value. Usually it fills some key in the options dict. It must return True if it has elaborated the key, False, if it doesn't know that key. eg: if key in ['-n', '--number']: self.options['number'] = value return True return False """ if key in ("-c", "--collection"): task_set_option("collection", value) elif key in ("-f", "--force"): task_set_option("force", 1) elif key in ("-p", "--part"): task_set_option("part", int(value)) elif key in ("-l", "--language"): languages = task_get_option("language", []) languages += value.split(',') for ln in languages: if ln not in CFG_SITE_LANGS: print 'ERROR: "%s" is not a recognized language code' % ln return False task_set_option("language", languages) else: return False return True def task_submit_check_options(): if task_has_option('collection'): coll = get_collection(task_get_option("collection")) if coll.id is None: print 'ERROR: Collection "%s" does not exist' % coll.name return False return True def task_run_core(): """ Reimplement to add the body of the task.""" task_run_start_timestamp = get_current_time_timestamp() colls = [] # decide whether we need to run or not, by comparing last updated timestamps: write_message("Database timestamp is %s." % get_database_last_updated_timestamp(), verbose=3) write_message("Collection cache timestamp is %s." % get_cache_last_updated_timestamp(), verbose=3) if task_has_option("part"): write_message("Running cache update part %s only." % task_get_option("part"), verbose=3) if task_has_option("force") or \ compare_timestamps_with_tolerance(get_database_last_updated_timestamp(), get_cache_last_updated_timestamp(), cfg_cache_last_updated_timestamp_tolerance) >= 0: ## either forced update was requested or cache is not up to date, so recreate it: # firstly, decide which collections to do: if task_has_option("collection"): coll = get_collection(task_get_option("collection")) colls.append(coll) else: res = run_sql("SELECT name FROM collection ORDER BY id") for row in res: colls.append(get_collection(row[0])) # secondly, update collection reclist cache: if task_get_option('part', 1) == 1: i = 0 for coll in colls: i += 1 write_message("%s / reclist cache update" % coll.name) coll.calculate_reclist() task_sleep_now_if_required() coll.update_reclist() task_update_progress("Part 1/2: done %d/%d" % (i, len(colls))) task_sleep_now_if_required(can_stop_too=True) # thirdly, update collection webpage cache: if task_get_option("part", 2) == 2: i = 0 for coll in colls: i += 1 write_message("%s / webpage cache update" % coll.name) coll.update_webpage_cache() task_update_progress("Part 2/2: done %d/%d" % (i, len(colls))) task_sleep_now_if_required(can_stop_too=True) # finally update the cache last updated timestamp: # (but only when all collections were updated, not when only # some of them were forced-updated as per admin's demand) if not task_has_option("collection"): set_cache_last_updated_timestamp(task_run_start_timestamp) write_message("Collection cache timestamp is set to %s." % get_cache_last_updated_timestamp(), verbose=3) else: ## cache up to date, we don't have to run write_message("Collection cache is up to date, no need to run.") ## we are done: return True ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/websearch/lib/websearch_webinterface.py b/modules/websearch/lib/websearch_webinterface.py index 001bf6efc..f2aa0ab4b 100644 --- a/modules/websearch/lib/websearch_webinterface.py +++ b/modules/websearch/lib/websearch_webinterface.py @@ -1,1182 +1,1203 @@ ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """WebSearch URL handler.""" __revision__ = "$Id$" import cgi import os import datetime import time from urllib import quote try: from mod_python import apache except ImportError: pass #maximum number of collaborating authors etc shown in GUI MAX_COLLAB_LIST = 10 MAX_KEYWORD_LIST = 10 MAX_VENUE_LIST = 10 #tag constants AUTHOR_TAG = "100__a" COAUTHOR_TAG = "700__a" AUTHOR_INST_TAG = "100__u" VENUE_TAG = "909C4p" KEYWORD_TAG = "6531_a" try: Set = set except NameError: from sets import Set from invenio.config import \ CFG_SITE_URL, \ CFG_SITE_NAME, \ CFG_CACHEDIR, \ CFG_SITE_LANG, \ CFG_SITE_ADMIN_EMAIL, \ CFG_SITE_SECURE_URL, \ CFG_WEBSEARCH_INSTANT_BROWSE_RSS, \ CFG_WEBSEARCH_RSS_TTL, \ CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS, \ CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, \ CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES, \ CFG_WEBDIR, \ CFG_WEBSEARCH_USE_JSMATH_FOR_FORMATS, \ CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS, \ CFG_WEBSEARCH_PERMITTED_RESTRICTED_COLLECTIONS_LEVEL, \ CFG_WEBSEARCH_USE_ALEPH_SYSNOS from invenio.dbquery import Error from invenio.webinterface_handler import wash_urlargd, WebInterfaceDirectory from invenio.urlutils import redirect_to_url, make_canonical_urlargd, drop_default_urlargd, create_html_link from invenio.webuser import getUid, page_not_authorized, get_user_preferences, \ collect_user_info, http_check_credentials, logoutUser, isUserSuperAdmin, \ session_param_get from invenio import search_engine from invenio.websubmit_webinterface import WebInterfaceFilesPages from invenio.bibclassify_webinterface import WebInterfaceKeywordsPages from invenio.webcomment_webinterface import WebInterfaceCommentsPages from invenio.bibcirculation_webinterface import WebInterfaceHoldingsPages from invenio.webpage import page, create_error_box from invenio.messages import gettext_set_language from invenio.search_engine import get_colID, get_coll_i18nname, \ check_user_can_view_record, collection_restricted_p, restricted_collection_cache, \ get_fieldvalues, get_most_popular_field_values, get_mysql_recid_from_aleph_sysno from invenio.access_control_engine import acc_authorize_action from invenio.access_control_config import VIEWRESTRCOLL from invenio.access_control_mailcookie import mail_cookie_create_authorize_action from invenio.bibformat import format_records from invenio.bibformat_engine import get_output_formats from invenio.websearch_webcoll import mymkdir, get_collection from invenio.intbitset import intbitset from invenio.bibupload import find_record_from_sysno from invenio.bibrank_citation_searcher import get_author_cited_by, get_cited_by_list from invenio.bibrank_downloads_indexer import get_download_weight_total from invenio.search_engine_summarizer import summarize_records from invenio.errorlib import register_exception from invenio.bibedit_webinterface import WebInterfaceEditPages from invenio.bibeditmulti_webinterface import WebInterfaceMultiEditPages from invenio.bibmerge_webinterface import WebInterfaceMergePages import invenio.template websearch_templates = invenio.template.load('websearch') search_results_default_urlargd = websearch_templates.search_results_default_urlargd search_interface_default_urlargd = websearch_templates.search_interface_default_urlargd try: output_formats = [output_format['attrs']['code'].lower() for output_format in \ get_output_formats(with_attributes=True).values()] except KeyError: output_formats = ['xd', 'xm', 'hd', 'hb', 'hs', 'hx'] output_formats.extend(['hm', 't', 'h']) def wash_search_urlargd(form): """ Create canonical search arguments from those passed via web form. """ argd = wash_urlargd(form, search_results_default_urlargd) + if argd.has_key('as'): + argd['aas'] = argd['as'] + del argd['as'] # Sometimes, users pass ot=245,700 instead of # ot=245&ot=700. Normalize that. ots = [] for ot in argd['ot']: ots += ot.split(',') argd['ot'] = ots # We can either get the mode of function as # action=<browse|search>, or by setting action_browse or # action_search. if argd['action_browse']: argd['action'] = 'browse' elif argd['action_search']: argd['action'] = 'search' else: if argd['action'] not in ('browse', 'search'): argd['action'] = 'search' del argd['action_browse'] del argd['action_search'] return argd class WebInterfaceUnAPIPages(WebInterfaceDirectory): """ Handle /unapi set of pages.""" _exports = [''] def __call__(self, req, form): argd = wash_urlargd(form, { 'id' : (int, 0), 'format' : (str, '')}) formats_dict = get_output_formats(True) formats = {} for format in formats_dict.values(): if format['attrs']['visibility']: formats[format['attrs']['code'].lower()] = format['attrs']['content_type'] del formats_dict if argd['id'] and argd['format']: ## Translate back common format names format = { 'nlm' : 'xn', 'marcxml' : 'xm', 'dc' : 'xd', 'endnote' : 'xe', 'mods' : 'xo' }.get(argd['format'], argd['format']) if format in formats: redirect_to_url(req, '%s/record/%s/export/%s' % (CFG_SITE_URL, argd['id'], format)) else: raise apache.SERVER_RETURN, apache.HTTP_NOT_ACCEPTABLE elif argd['id']: return websearch_templates.tmpl_unapi(formats, identifier=argd['id']) else: return websearch_templates.tmpl_unapi(formats) index = __call__ class WebInterfaceAuthorPages(WebInterfaceDirectory): """ Handle /author/Doe%2C+John etc set of pages.""" _exports = ['author'] def __init__(self, authorname=''): """Constructor.""" self.authorname = authorname def _lookup(self, component, path): """This handler parses dynamic URLs (/author/John+Doe).""" return WebInterfaceAuthorPages(component), path def __call__(self, req, form): """Serve the page in the given language.""" argd = wash_urlargd(form, {'ln': (str, CFG_SITE_LANG), 'verbose': (int, 0) }) ln = argd['ln'] verbose = argd['verbose'] req.argd = argd #needed since perform_req_search # start page req.content_type = "text/html" req.send_http_header() uid = getUid(req) search_engine.page_start(req, "hb", "", "", ln, uid) #wants to check it in case of no results self.authorname = self.authorname.replace("+"," ") if not self.authorname: return websearch_templates.tmpl_author_information(req, {}, self.authorname, 0, {}, {}, {}, {}, {}, ln) #let's see what takes time.. time1 = time.time() genstart = time1 citelist = get_author_cited_by(self.authorname) time2 = time.time() if verbose == 9: req.write("<br/>citelist generation took: "+str(time2-time1)+"<br/>") #search the publications by this author pubs = search_engine.perform_request_search(req=req, p=self.authorname, f="author") #get most frequent authors of these pubs popular_author_tuples = search_engine.get_most_popular_field_values(pubs, (AUTHOR_TAG, COAUTHOR_TAG)) authors= [] for (auth, frequency) in popular_author_tuples: if len(authors) < MAX_COLLAB_LIST: authors.append(auth) time1 = time.time() if verbose == 9: req.write("<br/>popularized authors: "+str(time1-time2)+"<br/>") #and publication venues venuetuples = search_engine.get_most_popular_field_values(pubs, (VENUE_TAG)) time2 = time.time() if verbose == 9: req.write("<br/>venues: "+str(time2-time1)+"<br/>") #and keywords kwtuples = search_engine.get_most_popular_field_values(pubs, (KEYWORD_TAG)) time1 = time.time() if verbose == 9: req.write("<br/>keywords: "+str(time1-time2)+"<br/>") #construct a simple list of tuples that contains keywords that appear more than once #moreover, limit the length of the list to MAX_KEYWORD_LIST kwtuples = kwtuples[0:MAX_KEYWORD_LIST] vtuples = venuetuples[0:MAX_VENUE_LIST] #remove the author in question from authors: they are associates if (authors.count(self.authorname) > 0): authors.remove(self.authorname) authors = authors[0:MAX_COLLAB_LIST] #cut extra time2 = time.time() if verbose == 9: req.write("<br/>misc: "+str(time2-time1)+"<br/>") #a dict. keys: affiliations, values: lists of publications author_aff_pubs = self.get_institute_pub_dict(pubs) authoraffs = author_aff_pubs.keys() time1 = time.time() if verbose == 9: req.write("<br/>affiliations: "+str(time1-time2)+"<br/>") #find out how many times these records have been downloaded recsloads = {} recsloads = get_download_weight_total(recsloads, pubs) #sum up totaldownloads = 0 for k in recsloads.keys(): totaldownloads = totaldownloads + recsloads[k] #get cited by.. citedbylist = get_cited_by_list(pubs) time1 = time.time() if verbose == 9: req.write("<br/>citedby: "+str(time1-time2)+"<br/>") #finally all stuff there, call the template websearch_templates.tmpl_author_information(req, pubs, self.authorname, totaldownloads, author_aff_pubs, citedbylist, kwtuples, authors, vtuples, ln) time1 = time.time() #cited-by summary out = summarize_records(intbitset(pubs), 'hcs', ln, self.authorname, 'author', req) time2 = time.time() if verbose == 9: req.write("<br/>summarizer: "+str(time2-time1)+"<br/>") req.write(out) simauthbox = search_engine.create_similarly_named_authors_link_box(self.authorname) req.write(simauthbox) if verbose == 9: req.write("<br/>all: "+str(time.time()-genstart)+"<br/>") return search_engine.page_end(req, 'hb', ln) def get_institute_pub_dict(self, recids): #return a dictionary consisting of institute -> list of publications affus = [] #list of insts from the record author_aff_pubs = {} #the disct to be build for recid in recids: #iterate all so that we get first author's intitute #if this the first author OR #"his" institute if he is an affliate author mainauthors = get_fieldvalues(recid, AUTHOR_TAG) mainauthor = " " if mainauthors: mainauthor = mainauthors[0] if (mainauthor == self.authorname): affus = get_fieldvalues(recid, AUTHOR_INST_TAG) #if this is empty, add a dummy " " value if (affus == []): affus = [" "] for a in affus: #add in author_aff_pubs if (author_aff_pubs.has_key(a)): tmp = author_aff_pubs[a] tmp.append(recid) author_aff_pubs[a] = tmp else: author_aff_pubs[a] = [recid] return author_aff_pubs index = __call__ class WebInterfaceRecordPages(WebInterfaceDirectory): """ Handling of a /record/<recid> URL fragment """ _exports = ['', 'files', 'reviews', 'comments', 'usage', 'references', 'export', 'citations', 'holdings', 'edit', 'keywords', 'multiedit', 'merge'] #_exports.extend(output_formats) def __init__(self, recid, tab, format=None): self.recid = recid self.tab = tab self.format = format self.export = self self.files = WebInterfaceFilesPages(self.recid) self.reviews = WebInterfaceCommentsPages(self.recid, reviews=1) self.comments = WebInterfaceCommentsPages(self.recid) self.usage = self self.references = self self.holdings = WebInterfaceHoldingsPages(self.recid) self.keywords = WebInterfaceKeywordsPages(self.recid) self.citations = self self.export = WebInterfaceRecordExport(self.recid, self.format) self.edit = WebInterfaceEditPages(self.recid) self.merge = WebInterfaceMergePages(self.recid) return def __call__(self, req, form): argd = wash_search_urlargd(form) argd['recid'] = self.recid argd['tab'] = self.tab if self.format is not None: argd['of'] = self.format req.argd = argd uid = getUid(req) if uid == -1: return page_not_authorized(req, "../", text="You are not authorized to view this record.", navmenuid='search') elif uid > 0: pref = get_user_preferences(uid) try: if not form.has_key('rg'): # fetch user rg preference only if not overridden via URL argd['rg'] = int(pref['websearch_group_records']) except (KeyError, ValueError): pass user_info = collect_user_info(req) (auth_code, auth_msg) = check_user_can_view_record(user_info, self.recid) if argd['rg'] > CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS and not isUserSuperAdmin(user_info): argd['rg'] = CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS if auth_code and user_info['email'] == 'guest' and not user_info['apache_user']: cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection' : search_engine.guess_primary_collection_of_a_record(self.recid)}) target = CFG_SITE_SECURE_URL + '/youraccount/login' + \ make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_URL + req.unparsed_uri}, {}) return redirect_to_url(req, target) elif auth_code: return page_not_authorized(req, "../", \ text = auth_msg,\ navmenuid='search') # mod_python does not like to return [] in case when of=id: out = search_engine.perform_request_search(req, **argd) if out == []: return str(out) else: return out # Return the same page wether we ask for /record/123 or /record/123/ index = __call__ class WebInterfaceRecordRestrictedPages(WebInterfaceDirectory): """ Handling of a /record-restricted/<recid> URL fragment """ _exports = ['', 'files', 'reviews', 'comments', 'usage', 'references', 'export', 'citations', 'holdings', 'edit', 'keywords', 'multiedit', 'merge'] #_exports.extend(output_formats) def __init__(self, recid, tab, format=None): self.recid = recid self.tab = tab self.format = format self.files = WebInterfaceFilesPages(self.recid) self.reviews = WebInterfaceCommentsPages(self.recid, reviews=1) self.comments = WebInterfaceCommentsPages(self.recid) self.usage = self self.references = self self.keywords = WebInterfaceKeywordsPages(self.recid) self.holdings = WebInterfaceHoldingsPages(self.recid) self.citations = self self.export = WebInterfaceRecordExport(self.recid, self.format) self.edit = WebInterfaceEditPages(self.recid) self.merge = WebInterfaceMergePages(self.recid) return def __call__(self, req, form): argd = wash_search_urlargd(form) argd['recid'] = self.recid if self.format is not None: argd['of'] = self.format req.argd = argd uid = getUid(req) user_info = collect_user_info(req) if uid == -1: return page_not_authorized(req, "../", text="You are not authorized to view this record.", navmenuid='search') elif uid > 0: pref = get_user_preferences(uid) try: if not form.has_key('rg'): # fetch user rg preference only if not overridden via URL argd['rg'] = int(pref['websearch_group_records']) except (KeyError, ValueError): pass if argd['rg'] > CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS and not isUserSuperAdmin(user_info): argd['rg'] = CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS record_primary_collection = search_engine.guess_primary_collection_of_a_record(self.recid) if collection_restricted_p(record_primary_collection): (auth_code, dummy) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=record_primary_collection) if auth_code: return page_not_authorized(req, "../", text="You are not authorized to view this record.", navmenuid='search') # Keep all the arguments, they might be reused in the # record page itself to derivate other queries req.argd = argd # mod_python does not like to return [] in case when of=id: out = search_engine.perform_request_search(req, **argd) if out == []: return str(out) else: return out # Return the same page wether we ask for /record/123 or /record/123/ index = __call__ class WebInterfaceSearchResultsPages(WebInterfaceDirectory): """ Handling of the /search URL and its sub-pages. """ _exports = ['', 'authenticate', 'cache', 'log'] def __call__(self, req, form): """ Perform a search. """ argd = wash_search_urlargd(form) _ = gettext_set_language(argd['ln']) if req.method == 'POST': raise apache.SERVER_RETURN, apache.HTTP_METHOD_NOT_ALLOWED uid = getUid(req) user_info = collect_user_info(req) if uid == -1: return page_not_authorized(req, "../", text = _("You are not authorized to view this area."), navmenuid='search') elif uid > 0: pref = get_user_preferences(uid) try: if not form.has_key('rg'): # fetch user rg preference only if not overridden via URL argd['rg'] = int(pref['websearch_group_records']) except (KeyError, ValueError): pass if CFG_WEBSEARCH_PERMITTED_RESTRICTED_COLLECTIONS_LEVEL == 2: ## Let's update the current collections list with all ## the restricted collections the user has rights to view. try: restricted_collections = user_info['precached_permitted_restricted_collections'] argd_collections = Set(argd['c']) argd_collections.update(restricted_collections) argd['c'] = list(argd_collections) except KeyError: pass if argd['rg'] > CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS and not isUserSuperAdmin(user_info): argd['rg'] = CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS involved_collections = Set() involved_collections.update(argd['c']) involved_collections.add(argd['cc']) if argd['id'] > 0: argd['recid'] = argd['id'] if argd['idb'] > 0: argd['recidb'] = argd['idb'] if argd['sysno']: tmp_recid = find_record_from_sysno(argd['sysno']) if tmp_recid: argd['recid'] = tmp_recid if argd['sysnb']: tmp_recid = find_record_from_sysno(argd['sysnb']) if tmp_recid: argd['recidb'] = tmp_recid if argd['recid'] > 0: if argd['recidb'] > argd['recid']: # Hack to check if among the restricted collections # at least a record of the range is there and # then if the user is not authorized for that # collection. recids = intbitset(xrange(argd['recid'], argd['recidb'])) restricted_collection_cache.recreate_cache_if_needed() for collname in restricted_collection_cache.cache: (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=collname) if auth_code and user_info['email'] == 'guest' and not user_info['apache_user']: coll_recids = get_collection(collname).reclist if coll_recids & recids: cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection' : collname}) target = CFG_SITE_SECURE_URL + '/youraccount/login' + \ make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_URL + req.unparsed_uri}, {}) return redirect_to_url(req, target) elif auth_code: return page_not_authorized(req, "../", \ text = auth_msg,\ navmenuid='search') else: involved_collections.add(search_engine.guess_primary_collection_of_a_record(argd['recid'])) # If any of the collection requires authentication, redirect # to the authentication form. for coll in involved_collections: if collection_restricted_p(coll): (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=coll) if auth_code and user_info['email'] == 'guest' and not user_info['apache_user']: cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection' : coll}) target = CFG_SITE_SECURE_URL + '/youraccount/login' + \ make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_URL + req.unparsed_uri}, {}) return redirect_to_url(req, target) elif auth_code: return page_not_authorized(req, "../", \ text = auth_msg,\ navmenuid='search') # Keep all the arguments, they might be reused in the # search_engine itself to derivate other queries req.argd = argd # mod_python does not like to return [] in case when of=id: out = search_engine.perform_request_search(req, **argd) if out == []: return str(out) else: return out def cache(self, req, form): """Search cache page.""" argd = wash_urlargd(form, {'action': (str, 'show')}) return search_engine.perform_request_cache(req, action=argd['action']) def log(self, req, form): """Search log page.""" argd = wash_urlargd(form, {'date': (str, '')}) return search_engine.perform_request_log(req, date=argd['date']) def authenticate(self, req, form): """Restricted search results pages.""" argd = wash_search_urlargd(form) user_info = collect_user_info(req) for coll in argd['c'] + [argd['cc']]: if collection_restricted_p(coll): (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=coll) if auth_code and user_info['email'] == 'guest' and not user_info['apache_user']: cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection' : coll}) target = CFG_SITE_SECURE_URL + '/youraccount/login' + \ make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_URL + req.unparsed_uri}, {}) return redirect_to_url(req, target) elif auth_code: return page_not_authorized(req, "../", \ text = auth_msg,\ navmenuid='search') # Keep all the arguments, they might be reused in the # search_engine itself to derivate other queries req.argd = argd uid = getUid(req) if uid > 0: pref = get_user_preferences(uid) try: if not form.has_key('rg'): # fetch user rg preference only if not overridden via URL argd['rg'] = int(pref['websearch_group_records']) except (KeyError, ValueError): pass # mod_python does not like to return [] in case when of=id: out = search_engine.perform_request_search(req, **argd) if out == []: return str(out) else: return out index = __call__ class WebInterfaceLegacySearchPages(WebInterfaceDirectory): """ Handling of the /search.py URL and its sub-pages. """ _exports = ['', ('authenticate', 'index')] def __call__(self, req, form): """ Perform a search. """ argd = wash_search_urlargd(form) # We either jump into the generic search form, or the specific # /record/... display if a recid is requested if argd['recid'] != -1: target = '/record/%d' % argd['recid'] del argd['recid'] else: target = '/search' target += make_canonical_urlargd(argd, search_results_default_urlargd) return redirect_to_url(req, target, apache.HTTP_MOVED_PERMANENTLY) index = __call__ # Parameters for the legacy URLs, of the form /?c=ALEPH legacy_collection_default_urlargd = { 'as': (int, CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE), + 'aas': (int, CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE), 'verbose': (int, 0), 'c': (str, CFG_SITE_NAME)} class WebInterfaceSearchInterfacePages(WebInterfaceDirectory): """ Handling of collection navigation.""" _exports = [('index.py', 'legacy_collection'), ('', 'legacy_collection'), ('search.py', 'legacy_search'), 'search', 'openurl', 'testsso', 'logout_SSO_hook'] search = WebInterfaceSearchResultsPages() legacy_search = WebInterfaceLegacySearchPages() def testsso(self, req, form): """ For testing single sign-on """ req.add_common_vars() sso_env = {} for var, value in req.subprocess_env.iteritems(): if var.startswith('HTTP_ADFS_'): sso_env[var] = value out = "<HTML><HEAD><TITLE>SSO test</TITLE</HEAD>" out += "<BODY><TABLE>" for var, value in sso_env.iteritems(): out += "<TR><TD><STRONG>%s</STRONG></TD><TD>%s</TD></TR>" % (var, value) out += "</TABLE></BODY></HTML>" return out def logout_SSO_hook(self, req, form): """Script triggered by the display of the centralized SSO logout dialog. It logouts the user from CDS Invenio and stream back the expected picture.""" logoutUser(req) req.content_type = 'image/gif' req.encoding = None req.filename = 'wsignout.gif' req.headers_out["Content-Disposition"] = "inline; filename=wsignout.gif" req.set_content_length(os.path.getsize('%s/img/wsignout.gif' % CFG_WEBDIR)) req.send_http_header() req.sendfile('%s/img/wsignout.gif' % CFG_WEBDIR) def _lookup(self, component, path): """ This handler is invoked for the dynamic URLs (for collections and records)""" if component == 'collection': c = '/'.join(path) def answer(req, form): """Accessing collections cached pages.""" # Accessing collections: this is for accessing the # cached page on top of each collection. argd = wash_urlargd(form, search_interface_default_urlargd) # We simply return the cached page of the collection argd['c'] = c if not argd['c']: # collection argument not present; display # home collection by default argd['c'] = CFG_SITE_NAME + + # Treat `as' argument specially: + if argd.has_key('as'): + argd['aas'] = argd['as'] + del argd['as'] + return display_collection(req, **argd) return answer, [] elif component == 'record' and path and path[0] == 'merge': return WebInterfaceMergePages(), path[1:] elif component == 'record' and path and path[0] == 'edit': return WebInterfaceEditPages(), path[1:] - + elif component == 'record' and path[0] == 'multiedit': return WebInterfaceMultiEditPages(), path[1:] elif component == 'record' or component == 'record-restricted': try: if CFG_WEBSEARCH_USE_ALEPH_SYSNOS: # let us try to recognize /record/<SYSNO> style of URLs: x = get_mysql_recid_from_aleph_sysno(path[0]) if x: recid = x else: recid = int(path[0]) else: recid = int(path[0]) except IndexError: # display record #1 for URL /record without a number recid = 1 except ValueError: if path[0] == '': # display record #1 for URL /record/ without a number recid = 1 else: # display page not found for URLs like /record/foo return None, [] if recid <= 0: # display page not found for URLs like /record/-5 or /record/0 return None, [] format = None tab = '' try: if path[1] in ['', 'files', 'reviews', 'comments','usage', 'references', 'citations', 'holdings', 'edit', 'keywords', 'multiedit', 'merge']: tab = path[1] elif path[1] == 'export': tab = '' format = path[2] # format = None # elif path[1] in output_formats: # tab = '' # format = path[1] else: # display page not found for URLs like /record/references # for a collection where 'references' tabs is not visible return None, [] except IndexError: # Keep normal url if tabs is not specified pass #if component == 'record-restricted': #return WebInterfaceRecordRestrictedPages(recid, tab, format), path[1:] #else: return WebInterfaceRecordPages(recid, tab, format), path[1:] return None, [] def openurl(self, req, form): """ OpenURL Handler.""" argd = wash_urlargd(form, websearch_templates.tmpl_openurl_accepted_args) ret_url = websearch_templates.tmpl_openurl2invenio(argd) if ret_url: return redirect_to_url(req, ret_url) else: return redirect_to_url(req, CFG_SITE_URL) def legacy_collection(self, req, form): """Collection URL backward compatibility handling.""" accepted_args = dict(legacy_collection_default_urlargd) accepted_args.update({'referer' : (str, ''), 'realm' : (str, '')}) argd = wash_urlargd(form, accepted_args) # Apache authentication stuff if argd['realm']: http_check_credentials(req, argd['realm']) return redirect_to_url(req, argd['referer'] or '%s/youraccount/youradminactivities' % CFG_SITE_SECURE_URL) del argd['referer'] del argd['realm'] + # Treat `as' argument specially: + if argd.has_key('as'): + argd['aas'] = argd['as'] + del argd['as'] + # If we specify no collection, then we don't need to redirect # the user, so that accessing <http://yoursite/> returns the # default collection. if not form.has_key('c'): return display_collection(req, **argd) # make the collection an element of the path, and keep the # other query elements as is. If the collection is CFG_SITE_NAME, # however, redirect to the main URL. c = argd['c'] del argd['c'] if c == CFG_SITE_NAME: target = '/' else: target = '/collection/' + quote(c) + # Treat `as' argument specially: + # We are going to redirect, so replace `aas' by `as' visible argument: + if argd.has_key('aas'): + argd['as'] = argd['aas'] + del argd['aas'] + target += make_canonical_urlargd(argd, legacy_collection_default_urlargd) return redirect_to_url(req, target) -def display_collection(req, c, as, verbose, ln): +def display_collection(req, c, aas, verbose, ln): """Display search interface page for collection c by looking in the collection cache.""" _ = gettext_set_language(ln) - req.argd = drop_default_urlargd({'as': as, 'verbose': verbose, 'ln': ln}, + req.argd = drop_default_urlargd({'aas': aas, 'verbose': verbose, 'ln': ln}, search_interface_default_urlargd) # get user ID: try: uid = getUid(req) user_preferences = {} if uid == -1: return page_not_authorized(req, "../", text="You are not authorized to view this collection", navmenuid='search') elif uid > 0: user_preferences = get_user_preferences(uid) except Error: register_exception(req=req, alert_admin=True) return page(title=_("Internal Error"), body = create_error_box(req, verbose=verbose, ln=ln), description="%s - Internal Error" % CFG_SITE_NAME, keywords="%s, Internal Error" % CFG_SITE_NAME, language=ln, req=req, navmenuid='search') # start display: req.content_type = "text/html" req.send_http_header() # deduce collection id: colID = get_colID(c) if type(colID) is not int: page_body = '<p>' + (_("Sorry, collection %s does not seem to exist.") % ('<strong>' + str(c) + '</strong>')) + '</p>' page_body = '<p>' + (_("You may want to start browsing from %s.") % ('<a href="' + CFG_SITE_URL + '?ln=' + ln + '">' + get_coll_i18nname(CFG_SITE_NAME, ln) + '</a>')) + '</p>' if req.header_only: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND return page(title=_("Collection %s Not Found") % cgi.escape(c), body=page_body, description=(CFG_SITE_NAME + ' - ' + _("Not found") + ': ' + cgi.escape(str(c))), keywords="%s" % CFG_SITE_NAME, uid=uid, language=ln, req=req, navmenuid='search') - # wash `as' argument: + # wash `aas' argument: if not os.path.exists("%s/collections/%d/body-as=%d-ln=%s.html" % \ - (CFG_CACHEDIR, colID, as, ln)): - # nonexistent `as' asked for, fall back to Simple Search: - as = 0 + (CFG_CACHEDIR, colID, aas, ln)): + # nonexistent `aas' asked for, fall back to Simple Search: + aas = 0 # display collection interface page: try: filedesc = open("%s/collections/%d/navtrail-as=%d-ln=%s.html" % \ - (CFG_CACHEDIR, colID, as, ln), "r") + (CFG_CACHEDIR, colID, aas, ln), "r") c_navtrail = filedesc.read() filedesc.close() except: c_navtrail = "" try: filedesc = open("%s/collections/%d/body-as=%d-ln=%s.html" % \ - (CFG_CACHEDIR, colID, as, ln), "r") + (CFG_CACHEDIR, colID, aas, ln), "r") c_body = filedesc.read() filedesc.close() except: c_body = "" try: filedesc = open("%s/collections/%d/portalbox-tp-ln=%s.html" % \ (CFG_CACHEDIR, colID, ln), "r") c_portalbox_tp = filedesc.read() filedesc.close() except: c_portalbox_tp = "" try: filedesc = open("%s/collections/%d/portalbox-te-ln=%s.html" % \ (CFG_CACHEDIR, colID, ln), "r") c_portalbox_te = filedesc.read() filedesc.close() except: c_portalbox_te = "" try: filedesc = open("%s/collections/%d/portalbox-lt-ln=%s.html" % \ (CFG_CACHEDIR, colID, ln), "r") c_portalbox_lt = filedesc.read() filedesc.close() except: c_portalbox_lt = "" try: # show help boxes (usually located in "tr", "top right") # if users have not banned them in their preferences: c_portalbox_rt = "" if user_preferences.get('websearch_helpbox', 1) > 0: filedesc = open("%s/collections/%d/portalbox-rt-ln=%s.html" % \ (CFG_CACHEDIR, colID, ln), "r") c_portalbox_rt = filedesc.read() filedesc.close() except: c_portalbox_rt = "" try: filedesc = open("%s/collections/%d/last-updated-ln=%s.html" % \ (CFG_CACHEDIR, colID, ln), "r") c_last_updated = filedesc.read() filedesc.close() except: c_last_updated = "" try: title = get_coll_i18nname(c, ln) # if there is only one collection defined, do not print its # title on the page as it would be displayed repetitively. if len(search_engine.collection_reclist_cache.cache.keys()) == 1: title = "" except: title = "" # RSS: rssurl = CFG_SITE_URL + '/rss' if c != CFG_SITE_NAME: rssurl += '?cc=' + quote(c) if 'hb' in CFG_WEBSEARCH_USE_JSMATH_FOR_FORMATS: metaheaderadd = """ <script type='text/javascript'> jsMath = { Controls: {cookie: {printwarn: 0}} }; </script> <script src='/jsMath/easy/invenio-jsmath.js' type='text/javascript'></script> """ else: metaheaderadd = '' return page(title=title, body=c_body, navtrail=c_navtrail, description="%s - %s" % (CFG_SITE_NAME, c), keywords="%s, %s" % (CFG_SITE_NAME, c), metaheaderadd=metaheaderadd, uid=uid, language=ln, req=req, cdspageboxlefttopadd=c_portalbox_lt, cdspageboxrighttopadd=c_portalbox_rt, titleprologue=c_portalbox_tp, titleepilogue=c_portalbox_te, lastupdated=c_last_updated, navmenuid='search', rssurl=rssurl, show_title_p=-1 not in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES) class WebInterfaceRSSFeedServicePages(WebInterfaceDirectory): """RSS 2.0 feed service pages.""" def __call__(self, req, form): """RSS 2.0 feed service.""" # Keep only interesting parameters for the search default_params = websearch_templates.rss_default_urlargd # We need to keep 'jrec' and 'rg' here in order to have # 'multi-page' RSS. These parameters are not kept be default # as we don't want to consider them when building RSS links # from search and browse pages. default_params.update({'jrec':(int, 1), 'rg': (int, CFG_WEBSEARCH_INSTANT_BROWSE_RSS)}) argd = wash_urlargd(form, default_params) for coll in argd['c'] + [argd['cc']]: if collection_restricted_p(coll): user_info = collect_user_info(req) (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=coll) if auth_code and user_info['email'] == 'guest' and not user_info['apache_user']: cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection' : coll}) target = CFG_SITE_SECURE_URL + '/youraccount/login' + \ make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_URL + req.unparsed_uri}, {}) return redirect_to_url(req, target) elif auth_code: return page_not_authorized(req, "../", \ text = auth_msg,\ navmenuid='search') # Create a standard filename with these parameters current_url = websearch_templates.build_rss_url(argd) cache_filename = current_url.split('/')[-1] # In the same way as previously, add 'jrec' & 'rg' req.content_type = "application/rss+xml" req.send_http_header() try: # Try to read from cache path = "%s/rss/%s.xml" % (CFG_CACHEDIR, cache_filename) # Check if cache needs refresh filedesc = open(path, "r") last_update_time = datetime.datetime.fromtimestamp(os.stat(os.path.abspath(path)).st_mtime) assert(datetime.datetime.now() < last_update_time + datetime.timedelta(minutes=CFG_WEBSEARCH_RSS_TTL)) c_rss = filedesc.read() filedesc.close() req.write(c_rss) return except Exception, e: # do it live and cache previous_url = None if argd['jrec'] > 1: prev_jrec = argd['jrec'] - argd['rg'] if prev_jrec < 1: prev_jrec = 1 previous_url = websearch_templates.build_rss_url(argd, jrec=prev_jrec) recIDs = search_engine.perform_request_search(req, of="id", c=argd['c'], cc=argd['cc'], p=argd['p'], f=argd['f'], p1=argd['p1'], f1=argd['f1'], m1=argd['m1'], op1=argd['op1'], p2=argd['p2'], f2=argd['f2'], m2=argd['m2'], op2=argd['op2'], p3=argd['p3'], f3=argd['f3'], m3=argd['m3']) next_url = None if len(recIDs) >= argd['jrec'] + argd['rg']: next_url = websearch_templates.build_rss_url(argd, jrec=(argd['jrec'] + argd['rg'])) recIDs = recIDs[-argd['jrec']:(-argd['rg']-argd['jrec']):-1] rss_prologue = '<?xml version="1.0" encoding="UTF-8"?>\n' + \ websearch_templates.tmpl_xml_rss_prologue(current_url=current_url, previous_url=previous_url, next_url=next_url) + '\n' req.write(rss_prologue) rss_body = format_records(recIDs, of='xr', record_separator="\n", req=req, epilogue="\n") rss_epilogue = websearch_templates.tmpl_xml_rss_epilogue() + '\n' req.write(rss_epilogue) # update cache dirname = "%s/rss" % (CFG_CACHEDIR) mymkdir(dirname) fullfilename = "%s/rss/%s.xml" % (CFG_CACHEDIR, cache_filename) try: # Remove the file just in case it already existed # so that a bit of space is created os.remove(fullfilename) except OSError: pass # Check if there's enough space to cache the request. if len(os.listdir(dirname)) < CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS: try: os.umask(022) f = open(fullfilename, "w") f.write(rss_prologue + rss_body + rss_epilogue) f.close() except IOError, v: if v[0] == 36: # URL was too long. Never mind, don't cache pass else: raise repr(v) index = __call__ class WebInterfaceRecordExport(WebInterfaceDirectory): """ Handling of a /record/<recid>/export/<format> URL fragment """ _exports = output_formats def __init__(self, recid, format=None): self.recid = recid self.format = format for output_format in output_formats: self.__dict__[output_format] = self return def __call__(self, req, form): argd = wash_search_urlargd(form) argd['recid'] = self.recid if self.format is not None: argd['of'] = self.format req.argd = argd uid = getUid(req) if uid == -1: return page_not_authorized(req, "../", text="You are not authorized to view this record.", navmenuid='search') elif uid > 0: pref = get_user_preferences(uid) try: if not form.has_key('rg'): # fetch user rg preference only if not overridden via URL argd['rg'] = int(pref['websearch_group_records']) except (KeyError, ValueError): pass # Check if the record belongs to a restricted primary # collection. If yes, redirect to the authenticated URL. user_info = collect_user_info(req) (auth_code, auth_msg) = check_user_can_view_record(user_info, self.recid) if argd['rg'] > CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS and not isUserSuperAdmin(user_info): argd['rg'] = CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS if auth_code and user_info['email'] == 'guest' and not user_info['apache_user']: cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection' : search_engine.guess_primary_collection_of_a_record(self.recid)}) target = CFG_SITE_SECURE_URL + '/youraccount/login' + \ make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_URL + req.unparsed_uri}, {}) return redirect_to_url(req, target) elif auth_code: return page_not_authorized(req, "../", \ text = auth_msg,\ navmenuid='search') # mod_python does not like to return [] in case when of=id: out = search_engine.perform_request_search(req, **argd) if out == []: return str(out) else: return out # Return the same page wether we ask for /record/123/export/xm or /record/123/export/xm/ index = __call__ diff --git a/modules/websubmit/lib/websubmit_webinterface.py b/modules/websubmit/lib/websubmit_webinterface.py index 83f29b716..45403e7f8 100644 --- a/modules/websubmit/lib/websubmit_webinterface.py +++ b/modules/websubmit/lib/websubmit_webinterface.py @@ -1,745 +1,745 @@ ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. __lastupdated__ = """$Date$""" __revision__ = "$Id$" import os import time import cgi try: from mod_python import apache except ImportError: pass from urllib import unquote, urlencode from invenio.config import \ CFG_ACCESS_CONTROL_LEVEL_SITE, \ CFG_SITE_LANG, \ CFG_SITE_NAME, \ CFG_SITE_NAME_INTL, \ CFG_SITE_URL, \ CFG_SITE_SECURE_URL, \ CFG_WEBSUBMIT_STORAGEDIR, \ CFG_PREFIX from invenio.dbquery import run_sql from invenio.access_control_config import VIEWRESTRCOLL from invenio.access_control_mailcookie import mail_cookie_create_authorize_action from invenio.access_control_engine import acc_authorize_action from invenio.webpage import page, create_error_box, pageheaderonly, \ pagefooteronly from invenio.webuser import getUid, page_not_authorized, collect_user_info, isGuestUser from invenio.websubmit_config import * from invenio.webinterface_handler import wash_urlargd, WebInterfaceDirectory from invenio.urlutils import make_canonical_urlargd, redirect_to_url from invenio.messages import gettext_set_language from invenio.search_engine import \ guess_primary_collection_of_a_record, \ get_colID, \ create_navtrail_links, check_user_can_view_record from invenio.bibdocfile import BibRecDocs, normalize_format, file_strip_ext, \ stream_restricted_icon, BibDoc, InvenioWebSubmitFileError, stream_file from invenio.errorlib import register_exception from invenio.websubmit_icon_creator import create_icon, InvenioWebSubmitIconCreatorError import invenio.template websubmit_templates = invenio.template.load('websubmit') from invenio.websearchadminlib import get_detailed_page_tabs import invenio.template webstyle_templates = invenio.template.load('webstyle') websearch_templates = invenio.template.load('websearch') try: from invenio.fckeditor_invenio_connector import FCKeditorConnectorInvenio fckeditor_available = True except ImportError, e: fckeditor_available = False class WebInterfaceFilesPages(WebInterfaceDirectory): def __init__(self,recid): self.recid = recid def _lookup(self, component, path): # after /record/<recid>/files/ every part is used as the file # name filename = unquote(component) def getfile(req, form): args = wash_urlargd(form, websubmit_templates.files_default_urlargd) ln = args['ln'] _ = gettext_set_language(ln) uid = getUid(req) user_info = collect_user_info(req) verbose = args['verbose'] if verbose >= 1 and acc_authorize_action(user_info, 'fulltext')[0] != 0: # Only SuperUser can see all the details! verbose = 0 if uid == -1 or CFG_ACCESS_CONTROL_LEVEL_SITE > 1: return page_not_authorized(req, "/record/%s" % self.recid, navmenuid='submit') (auth_code, auth_msg) = check_user_can_view_record(user_info, self.recid) if auth_code and user_info['email'] == 'guest' and not user_info['apache_user']: cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection' : guess_primary_collection_of_a_record(self.recid)}) target = '/youraccount/login' + \ make_canonical_urlargd({'action': cookie, 'ln' : ln, 'referer' : \ CFG_SITE_URL + user_info['uri']}, {}) return redirect_to_url(req, target) elif auth_code: return page_not_authorized(req, "../", \ text = auth_msg) readonly = CFG_ACCESS_CONTROL_LEVEL_SITE == 1 # From now on: either the user provided a specific file # name (and a possible version), or we return a list of # all the available files. In no case are the docids # visible. try: bibarchive = BibRecDocs(self.recid) except InvenioWebSubmitFileError, e: register_exception(req=req, alert_admin=True) msg = "<p>%s</p><p>%s</p>" % ( _("The system has encountered an error in retrieving the list of files for this document."), _("The error has been logged and will be taken in consideration as soon as possible.")) return print_warning(msg) docname = '' format = '' version = '' if filename: # We know the complete file name, guess which docid it # refers to ## TODO: Change the extension system according to ext.py from setlink ## and have a uniform extension mechanism... docname = file_strip_ext(filename) format = filename[len(docname):] if format and format[0] != '.': format = '.' + format else: docname = args['docname'] if not format: format = args['format'] if not version: version = args['version'] # version could be either empty, or all or an integer try: int(version) except ValueError: if version != 'all': version = '' display_hidden = acc_authorize_action(user_info, 'fulltext')[0] == 0 if version != 'all': # search this filename in the complete list of files for doc in bibarchive.list_bibdocs(): if docname == doc.get_docname(): try: docfile = doc.get_file(format, version) except InvenioWebSubmitFileError, msg: register_exception(req=req, alert_admin=True) if docfile.get_status() == '': # The file is not resticted, let's check for # collection restriction then. (auth_code, auth_message) = check_user_can_view_record(user_info, self.recid) if auth_code: return warningMsg(_("The collection to which this file belong is restricted: ") + auth_message, req, CFG_SITE_NAME, ln) else: # The file is probably restricted on its own. # Let's check for proper authorization then (auth_code, auth_message) = docfile.is_restricted(req) if auth_code != 0: return warningMsg(_("This file is restricted: ") + auth_message, req, CFG_SITE_NAME, ln) if display_hidden or not docfile.hidden_p(): if not readonly: ip = str(req.get_remote_host(apache.REMOTE_NOLOOKUP)) res = doc.register_download(ip, version, format, uid) try: return docfile.stream(req) except InvenioWebSubmitFileError, msg: register_exception(req=req, alert_admin=True) return warningMsg(_("An error has happened in trying to stream the request file."), req, CFG_SITE_NAME, ln) else: warn = print_warning(_("The requested file is hidden and you don't have the proper rights to access it.")) elif doc.get_icon() is not None and doc.get_icon().docname == file_strip_ext(filename): icon = doc.get_icon() try: iconfile = icon.get_file(format, version) except InvenioWebSubmitFileError, msg: register_exception(req=req, alert_admin=True) return warningMsg(_("An error has happened in trying to retrieve the corresponding icon."), req, CFG_SITE_NAME, ln) if iconfile.get_status() == '': # The file is not resticted, let's check for # collection restriction then. (auth_code, auth_message) = check_user_can_view_record(user_info, self.recid) if auth_code: return stream_restricted_icon(req) else: # The file is probably restricted on its own. # Let's check for proper authorization then (auth_code, auth_message) = iconfile.is_restricted(req) if auth_code != 0: return stream_restricted_icon(req) if not readonly: ip = str(req.get_remote_host(apache.REMOTE_NOLOOKUP)) res = doc.register_download(ip, version, format, uid) try: return iconfile.stream(req) except InvenioWebSubmitFileError, msg: register_exception(req=req, alert_admin=True) return warningMsg(_("An error has happened in trying to stream the corresponding icon."), req, CFG_SITE_NAME, ln) if docname and format and display_hidden: req.status = apache.HTTP_NOT_FOUND warn = print_warning(_("Requested file does not seem to exist.")) else: warn = '' filelist = bibarchive.display("", version, ln=ln, verbose=verbose, display_hidden=display_hidden) t = warn + websubmit_templates.tmpl_filelist( ln=ln, recid=self.recid, docname=args['docname'], version=version, filelist=filelist) cc = guess_primary_collection_of_a_record(self.recid) unordered_tabs = get_detailed_page_tabs(get_colID(cc), self.recid, ln) ordered_tabs_id = [(tab_id, values['order']) for (tab_id, values) in unordered_tabs.iteritems()] ordered_tabs_id.sort(lambda x,y: cmp(x[1],y[1])) link_ln = '' if ln != CFG_SITE_LANG: link_ln = '?ln=%s' % ln tabs = [(unordered_tabs[tab_id]['label'], \ '%s/record/%s/%s%s' % (CFG_SITE_URL, self.recid, tab_id, link_ln), \ tab_id == 'files', unordered_tabs[tab_id]['enabled']) \ for (tab_id, order) in ordered_tabs_id if unordered_tabs[tab_id]['visible'] == True] top = webstyle_templates.detailed_record_container_top(self.recid, tabs, args['ln']) bottom = webstyle_templates.detailed_record_container_bottom(self.recid, tabs, args['ln']) title, description, keywords = websearch_templates.tmpl_record_page_header_content(req, self.recid, args['ln']) return pageheaderonly(title=title, - navtrail=create_navtrail_links(cc=cc, as=0, ln=ln) + \ + navtrail=create_navtrail_links(cc=cc, aas=0, ln=ln) + \ ''' > <a class="navtrail" href="%s/record/%s">%s</a> > %s''' % \ (CFG_SITE_URL, self.recid, title, _("Access to Fulltext")), description="", keywords="keywords", uid=uid, language=ln, req=req, navmenuid='search', navtrail_append_title_p=0) + \ websearch_templates.tmpl_search_pagestart(ln) + \ top + t + bottom + \ websearch_templates.tmpl_search_pageend(ln) + \ pagefooteronly(lastupdated=__lastupdated__, language=ln, req=req) return getfile, [] def __call__(self, req, form): """Called in case of URLs like /record/123/files without trailing slash. """ args = wash_urlargd(form, websubmit_templates.files_default_urlargd) ln = args['ln'] link_ln = '' if ln != CFG_SITE_LANG: link_ln = '?ln=%s' % ln return redirect_to_url(req, '%s/record/%s/files/%s' % (CFG_SITE_URL, self.recid, link_ln)) def websubmit_legacy_getfile(req, form): """ Handle legacy /getfile.py URLs """ args = wash_urlargd(form, { 'recid': (int, 0), 'docid': (int, 0), 'version': (str, ''), 'name': (str, ''), 'format': (str, ''), 'ln' : (str, CFG_SITE_LANG) }) _ = gettext_set_language(args['ln']) def _getfile_py(req, recid=0, docid=0, version="", name="", format="", ln=CFG_SITE_LANG): if not recid: ## Let's obtain the recid from the docid if docid: try: bibdoc = BibDoc(docid=docid) recid = bibdoc.get_recid() except InvenioWebSubmitFileError, e: return warningMsg(_("An error has happened in trying to retrieve the requested file."), req, CFG_SITE_NAME, ln) else: return warningMsg(_('Not enough information to retrieve the document'), req, CFG_SITE_NAME, ln) else: if not name and docid: ## Let's obtain the name from the docid try: bibdoc = BibDoc(docid) name = bibdoc.get_docname() except InvenioWebSubmitFileError, e: return warningMsg(_("An error has happened in trying to retrieving the requested file."), req, CFG_SITE_NAME, ln) format = normalize_format(format) redirect_to_url(req, '%s/record/%s/files/%s%s?ln=%s%s' % (CFG_SITE_URL, recid, name, format, ln, version and 'version=%s' % version or ''), apache.HTTP_MOVED_PERMANENTLY) return _getfile_py(req, **args) # -------------------------------------------------- from invenio.websubmit_engine import home, action, interface, endaction class WebInterfaceSubmitPages(WebInterfaceDirectory): _exports = ['summary', 'sub', 'direct', '', 'attachfile'] def attachfile(self, req, form): """ Process requests received from FCKeditor to upload files. If the uploaded file is an image, create an icon version """ if not fckeditor_available: return apache.HTTP_NOT_FOUND if not form.has_key('NewFile') or \ not form.get('type', None) in \ ['File', 'Image', 'Flash', 'Media']: return apache.HTTP_NOT_FOUND uid = getUid(req) # URL where the file can be fetched after upload user_files_path = '%(CFG_SITE_URL)s/submit/getattachedfile/%(uid)s' % \ {'uid': uid, 'CFG_SITE_URL': CFG_SITE_URL} # Path to directory where uploaded files are saved user_files_absolute_path = '%(CFG_PREFIX)s/var/tmp/attachfile/%(uid)s' % \ {'uid': uid, 'CFG_PREFIX': CFG_PREFIX} try: os.makedirs(user_files_absolute_path) except: pass # Create a Connector instance to handle the request conn = FCKeditorConnectorInvenio(form, recid=-1, uid=uid, allowed_commands=['QuickUpload'], allowed_types = ['File', 'Image', 'Flash', 'Media'], user_files_path = user_files_path, user_files_absolute_path = user_files_absolute_path) user_info = collect_user_info(req) (auth_code, auth_msg) = acc_authorize_action(user_info, 'attachsubmissionfile') if user_info['email'] == 'guest' and not user_info['apache_user']: # User is guest: must login prior to upload data = conn.sendUploadResults(1, '', '', 'Please login before uploading file.') elif auth_code: # User cannot submit data = conn.sendUploadResults(1, '', '', 'Sorry, you are not allowed to submit files.') else: # Process the upload and get the response data = conn.doResponse() # At this point, the file has been uploaded. The FCKeditor # submit the image in form['NewFile']. However, the image # might have been renamed in between by the FCK connector on # the server side, by appending (%04d) at the end of the base # name. Retrieve that file uploaded_file_path = os.path.join(user_files_absolute_path, form['type'].lower(), form['NewFile'].filename) uploaded_file_path = retrieve_most_recent_attached_file(uploaded_file_path) uploaded_file_name = os.path.basename(uploaded_file_path) # Create an icon if form.get('type','') == 'Image': try: (icon_path, icon_name) = create_icon( { 'input-file' : uploaded_file_path, 'icon-name' : os.path.splitext(uploaded_file_name)[0], 'icon-file-format' : os.path.splitext(uploaded_file_name)[1][1:] or 'gif', 'multipage-icon' : False, 'multipage-icon-delay' : 100, 'icon-scale' : "300>", # Resize only if width > 300 'verbosity' : 0, }) # Move original file to /original dir, and replace it with icon file original_user_files_absolute_path = os.path.join(user_files_absolute_path, 'image', 'original') if not os.path.exists(original_user_files_absolute_path): # Create /original dir if needed os.mkdir(original_user_files_absolute_path) os.rename(uploaded_file_path, original_user_files_absolute_path + os.sep + uploaded_file_name) os.rename(icon_path + os.sep + icon_name, uploaded_file_path) except InvenioWebSubmitIconCreatorError, e: pass # Transform the headers into something ok for mod_python for header in conn.headers: if not header is None: if header[0] == 'Content-Type': req.content_type = header[1] else: req.headers_out[header[0]] = header[1] # Send our response req.send_http_header() req.write(data) def _lookup(self, component, path): """ This handler is invoked for the dynamic URLs (for getting and putting attachments) Eg: /submit/getattachedfile/41336978/image/myfigure.png /submit/attachfile/41336978/image/myfigure.png """ if component == 'getattachedfile' and len(path) > 2: uid = path[0] # uid of the submitter file_type = path[1] # file, image, flash or media (as # defined by FCKeditor) if file_type in ['file', 'image', 'flash', 'media']: file_name = '/'.join(path[2:]) # the filename def answer_get(req, form): """Accessing files attached to submission.""" form['file'] = file_name form['type'] = file_type form['uid'] = uid return self.getattachedfile(req, form) return answer_get, [] # All other cases: file not found return None, [] def getattachedfile(self, req, form): """ Returns a file uploaded to the submission 'drop box' by the FCKeditor. """ argd = wash_urlargd(form, {'file': (str, None), 'type': (str, None), 'uid': (int, 0)}) # Can user view this record, i.e. can user access its # attachments? uid = getUid(req) user_info = collect_user_info(req) if not argd['file'] is None: # Prepare path to file on disk. Normalize the path so that # ../ and other dangerous components are removed. path = os.path.abspath(CFG_PREFIX + '/var/tmp/attachfile/' + \ '/' + str(argd['uid']) + \ '/' + argd['type'] + '/' + argd['file']) # Check that we are really accessing attachements # directory, for the declared record. if path.startswith(CFG_PREFIX + '/var/tmp/attachfile/') and os.path.exists(path): return stream_file(req, path) # Send error 404 in all other cases return(apache.HTTP_NOT_FOUND) def direct(self, req, form): """Directly redirected to an initialized submission.""" args = wash_urlargd(form, {'sub': (str, ''), 'access' : (str, '')}) sub = args['sub'] access = args['access'] ln = args['ln'] _ = gettext_set_language(ln) uid = getUid(req) if uid == -1 or CFG_ACCESS_CONTROL_LEVEL_SITE >= 1: return page_not_authorized(req, "direct", navmenuid='submit') myQuery = req.args if not sub: return warningMsg(_("Sorry, 'sub' parameter missing..."), req, ln=ln) res = run_sql("SELECT docname,actname FROM sbmIMPLEMENT WHERE subname=%s", (sub,)) if not res: return warningMsg(_("Sorry. Cannot analyse parameter"), req, ln=ln) else: # get document type doctype = res[0][0] # get action name action = res[0][1] # retrieve other parameter values params = dict(form) # find existing access number if not access: # create 'unique' access number pid = os.getpid() now = time.time() access = "%i_%s" % (now,pid) # retrieve 'dir' value res = run_sql ("SELECT dir FROM sbmACTION WHERE sactname=%s", (action,)) dir = res[0][0] mainmenu = req.headers_in.get('referer') params['access'] = access params['act'] = action params['doctype'] = doctype params['startPg'] = '1' params['mainmenu'] = mainmenu params['ln'] = ln params['indir'] = dir url = "%s/submit?%s" % (CFG_SITE_URL, urlencode(params)) redirect_to_url(req, url) def sub(self, req, form): """DEPRECATED: /submit/sub is deprecated now, so raise email to the admin (but allow submission to continue anyway)""" args = wash_urlargd(form, {'password': (str, '')}) uid = getUid(req) if uid == -1 or CFG_ACCESS_CONTROL_LEVEL_SITE >= 1: return page_not_authorized(req, "../sub/", navmenuid='submit') try: raise DeprecationWarning, 'submit/sub handler has been used. Please use submit/direct. e.g. "submit/sub?RN=123@SBIFOO" -> "submit/direct?RN=123&sub=SBIFOO"' except DeprecationWarning: register_exception(req=req, alert_admin=True) ln = args['ln'] _ = gettext_set_language(ln) #DEMOBOO_RN=DEMO-BOOK-2008-001&ln=en&password=1223993532.26572%40APPDEMOBOO params = dict(form) password = args['password'] if password: del params['password'] if "@" in password: params['access'], params['sub'] = password.split('@', 1) else: params['sub'] = password else: args = str(req.args).split('@') if len(args) > 1: params = {'sub' : args[-1]} args = '@'.join(args[:-1]) params.update(cgi.parse_qs(args)) else: return warningMsg(_("Sorry, invalid URL..."), req, ln=ln) url = "%s/submit/direct?%s" % (CFG_SITE_URL, urlencode(params, doseq=True)) redirect_to_url(req, url) def summary(self, req, form): args = wash_urlargd(form, { 'doctype': (str, ''), 'act': (str, ''), 'access': (str, ''), 'indir': (str, '')}) uid = getUid(req) if uid == -1 or CFG_ACCESS_CONTROL_LEVEL_SITE >= 1: return page_not_authorized(req, "../summary", navmenuid='submit') t="" curdir = os.path.join(CFG_WEBSUBMIT_STORAGEDIR, args['indir'], args['doctype'], args['access']) try: assert(curdir == os.path.abspath(curdir)) except AssertionError: register_exception(req=req, alert_admin=True, prefix='Possible cracking tentative: indir="%s", doctype="%s", access="%s"' % (args['indir'], args['doctype'], args['access'])) return warningMsg("Invalid parameters") subname = "%s%s" % (args['act'], args['doctype']) res = run_sql("select sdesc,fidesc,pagenb,level from sbmFIELD where subname=%s " "order by pagenb,fieldnb", (subname,)) nbFields = 0 values = [] for arr in res: if arr[0] != "": val = { 'mandatory' : (arr[3] == 'M'), 'value' : '', 'page' : arr[2], 'name' : arr[0], } if os.path.exists(os.path.join(curdir, curdir,arr[1])): fd = open(os.path.join(curdir, arr[1]),"r") value = fd.read() fd.close() value = value.replace("\n"," ") value = value.replace("Select:","") else: value = "" val['value'] = value values.append(val) return websubmit_templates.tmpl_submit_summary( ln = args['ln'], values = values, ) def index(self, req, form): args = wash_urlargd(form, { 'c': (str, CFG_SITE_NAME), 'doctype': (str, ''), 'act': (str, ''), 'startPg': (str, "1"), 'access': (str, ''), 'mainmenu': (str, ''), 'fromdir': (str, ''), 'nextPg': (str, ''), 'nbPg': (str, ''), 'curpage': (str, '1'), 'step': (str, '0'), 'mode': (str, 'U'), }) req.form = form ## Strip whitespace from beginning and end of doctype and action: args["doctype"] = args["doctype"].strip() args["act"] = args["act"].strip() def _index(req, c, ln, doctype, act, startPg, access, mainmenu, fromdir, nextPg, nbPg, curpage, step, mode): uid = getUid(req) if isGuestUser(uid): return redirect_to_url(req, "%s/youraccount/login%s" % ( CFG_SITE_SECURE_URL, make_canonical_urlargd({ 'referer' : CFG_SITE_URL + req.unparsed_uri, 'ln' : args['ln']}, {}))) if uid == -1 or CFG_ACCESS_CONTROL_LEVEL_SITE >= 1: return page_not_authorized(req, "../submit", navmenuid='submit') if doctype=="": return home(req,c,ln) elif act=="": return action(req,c,ln,doctype) elif int(step)==0: return interface(req, c, ln, doctype, act, startPg, access, mainmenu, fromdir, nextPg, nbPg, curpage) else: return endaction(req, c, ln, doctype, act, startPg, access,mainmenu, fromdir, nextPg, nbPg, curpage, step, mode) return _index(req, **args) # Answer to both /submit/ and /submit __call__ = index def errorMsg(title, req, c=None, ln=CFG_SITE_LANG): # load the right message language _ = gettext_set_language(ln) if c is None: c = CFG_SITE_NAME_INTL.get(ln, CFG_SITE_NAME) return page(title = _("Error"), body = create_error_box(req, title=title, verbose=0, ln=ln), description="%s - Internal Error" % c, keywords="%s, Internal Error" % c, uid = getUid(req), language=ln, req=req, navmenuid='submit') def warningMsg(title, req, c=None, ln=CFG_SITE_LANG): # load the right message language _ = gettext_set_language(ln) if c is None: c = CFG_SITE_NAME_INTL.get(ln, CFG_SITE_NAME) return page(title = _("Warning"), body = title, description="%s - Internal Error" % c, keywords="%s, Internal Error" % c, uid = getUid(req), language=ln, req=req, navmenuid='submit') def print_warning(msg, type='', prologue='<br />', epilogue='<br />'): """Prints warning message and flushes output.""" if msg: return websubmit_templates.tmpl_print_warning( msg = msg, type = type, prologue = prologue, epilogue = epilogue, ) else: return '' def retrieve_most_recent_attached_file(file_path): """ Retrieve the latest file that has been uploaded with the FCKeditor. This is the only way to retrieve files that the FCKeditor has renamed after the upload. Eg: 'prefix/image.jpg' was uploaded but did already exist. FCKeditor silently renamed it to 'prefix/image(0001).jpg': >>> retrieve_most_recent_attached_file('prefix/image.jpg') 'prefix/image(0001).jpg' """ (base_path, filename) = os.path.split(file_path) base_name = os.path.splitext(filename)[0] file_ext = os.path.splitext(filename)[1][1:] most_recent_filename = filename for i in range(1, 10000): # FCKeditor renames up to 9999. After it overwrites. possible_filename = "%s(%04d).%s" % \ (base_name, i, file_ext) if os.path.exists(base_path + os.sep + possible_filename): most_recent_filename = possible_filename else: break return os.path.join(base_path, most_recent_filename)