diff --git a/modules/bibrank/lib/bibrank_citation_searcher.py b/modules/bibrank/lib/bibrank_citation_searcher.py index 5635bf6d8..be68b130a 100644 --- a/modules/bibrank/lib/bibrank_citation_searcher.py +++ b/modules/bibrank/lib/bibrank_citation_searcher.py @@ -1,260 +1,285 @@ # -*- coding: utf-8 -*- ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. __revision__ = "$Id$" import re from invenio.dbquery import run_sql, get_table_update_time, OperationalError, \ deserialize_via_marshal from invenio.intbitset import intbitset from invenio.data_cacher import DataCacher class CitationDictsDataCacher(DataCacher): """ Cache holding all citation dictionaries (citationdict, reversedict, selfcitdict, selfcitedbydict). """ def __init__(self): def cache_filler(): alldicts = {} try: res = run_sql("SELECT object_name,object_value FROM rnkCITATIONDATA") except OperationalError: # database problems, return empty cache return {} for row in res: object_name = row[0] object_value = row[1] try: object_value_dict = deserialize_via_marshal(object_value) except: object_value_dict = {} alldicts[object_name] = object_value_dict if object_name == 'citationdict': # for cited:M->N queries, it is interesting to cache also # some preprocessed citationdict: alldicts['citationdict_keys'] = object_value_dict.keys() alldicts['citationdict_keys_intbitset'] = intbitset(object_value_dict.keys()) return alldicts def timestamp_verifier(): res = run_sql("""SELECT DATE_FORMAT(last_updated, '%Y-%m-%d %H:%i:%s') FROM rnkMETHOD WHERE name='citation'""") return res[0][0] DataCacher.__init__(self, cache_filler, timestamp_verifier) try: cache_citation_dicts.is_ok_p except Exception: cache_citation_dicts = CitationDictsDataCacher() def get_citation_dict(dictname): """Return cached value of a citation dictionary. DICTNAME can be citationdict, reversedict, selfcitdict, selfcitedbydict. """ cache_citation_dicts.recreate_cache_if_needed() return cache_citation_dicts.cache.get(dictname, {}) def get_cited_by(recordid): """Return a list of records that cite recordid""" ret = [] cache_cited_by_dictionary = get_citation_dict("citationdict") if cache_cited_by_dictionary.has_key(recordid): ret = cache_cited_by_dictionary[recordid] return ret def get_cited_by_count(recordid): """Return how many records cite given RECORDID.""" cache_cited_by_dictionary = get_citation_dict("citationdict") return len(cache_cited_by_dictionary.get(recordid, [])) def get_records_with_num_cites(numstr, allrecs = intbitset([])): """Return an intbitset of record IDs that are cited X times, X defined in numstr. Warning: numstr is string and may not be numeric! 
It can be 10,0->100 etc """ cache_cited_by_dictionary = get_citation_dict("citationdict") cache_cited_by_dictionary_keys = get_citation_dict("citationdict_keys") cache_cited_by_dictionary_keys_intbitset = get_citation_dict("citationdict_keys_intbitset") matches = intbitset([]) #once again, check that the parameter is a string if not (type(numstr) == type("thisisastring")): return intbitset([]) numstr = numstr.replace(" ",'') numstr = numstr.replace('"','') num = 0 #first, check if numstr is just a number singlenum = re.findall("(^\d+$)", numstr) if singlenum: num = int(singlenum[0]) if num == 0: #we return recids that are not in keys return allrecs - cache_cited_by_dictionary_keys_intbitset for k in cache_cited_by_dictionary_keys: li = cache_cited_by_dictionary[k] if len(li) == num: matches.add(k) return matches #try to get 1->10 or such firstsec = re.findall("(\d+)->(\d+)", numstr) if firstsec: first = 0 sec = -1 try: first = int(firstsec[0][0]) sec = int(firstsec[0][1]) except: return intbitset([]) if (first == 0): #start with those that have no cites.. matches = allrecs - cache_cited_by_dictionary_keys_intbitset if (first <= sec): for k in cache_cited_by_dictionary_keys: li = cache_cited_by_dictionary[k] if len(li) >= first: if len(li) <= sec: matches.add(k) return matches firstsec = re.findall("(\d+)\+", numstr) if firstsec: first = firstsec[0] for k in cache_cited_by_dictionary_keys: li = cache_cited_by_dictionary[k] if len(li) > int(first): matches.add(k) return matches def get_cited_by_list(recordlist): """Return a tuple of ([recid,list_of_citing_records],...) for all the records in recordlist. """ cache_cited_by_dictionary = get_citation_dict("citationdict") result = [] for recid in recordlist: result.append([recid, cache_cited_by_dictionary.get(recid, [])]) return result +def get_refersto_hitset(ahitset): + """ + Return a hitset of records that refers to (cite) some records from + the given ahitset. Useful for search engine's + refersto:author:ellis feature. + """ + cache_cited_by_dictionary = get_citation_dict("citationdict") + out = intbitset() + if ahitset: + for recid in ahitset: + out = out | intbitset(cache_cited_by_dictionary.get(recid, [])) + return out + +def get_citedby_hitset(ahitset): + """ + Return a hitset of records that are cited by records in the given + ahitset. Useful for search engine's citedby:author:ellis feature. + """ + cache_cited_by_dictionary = get_citation_dict("reversedict") + out = intbitset() + if ahitset: + for recid in ahitset: + out = out | intbitset(cache_cited_by_dictionary.get(recid, [])) + return out + def get_cited_by_weight(recordlist): """Return a tuple of ([recid,number_of_citing_records],...) for all the records in recordlist. """ cache_cited_by_dictionary = get_citation_dict("citationdict") result = [] for recid in recordlist: result.append([recid, len(cache_cited_by_dictionary.get(recid, []))]) return result def calculate_cited_by_list(record_id, sort_order="d"): """Return a tuple of ([recid,citation_weight],...) for all the - record in citing RECORD_ID. The resulting recids is sorted by + record citing RECORD_ID. The resulting recids is sorted by ascending/descending citation weights depending or SORT_ORDER. """ cache_cited_by_dictionary = get_citation_dict("citationdict") citation_list = [] result = [] # determine which record cite RECORD_ID: if cache_cited_by_dictionary: citation_list = cache_cited_by_dictionary.get(record_id, []) #add weights i.e. 
records that cite each of the entries in citation_list for c in citation_list: ccited = cache_cited_by_dictionary.get(c, []) result.append([c, len(ccited)]) # sort them: if result: if sort_order == "d": result.sort(lambda x, y: cmp(y[1], x[1])) else: result.sort(lambda x, y: cmp(x[1], y[1])) return result def get_author_cited_by(authorstring): """Return a list of doc ids [y1,y2,..] for the author given as param, such that y1,y2.. cite that author """ citations = [] res = run_sql("select hitlist from rnkAUTHORDATA where aterm=%s", (authorstring,)) if res and res[0] and res[0][0]: #has to be prepared for corrupted data! try: citations = deserialize_via_marshal(res[0][0]) except: citations = [] return citations def get_self_cited_by(record_id): """Return a list of doc ids [y1,y2,..] for the rec id x given as param, so that x cites y1,y2,.. and x and each y share an author """ cache_selfcit_dictionary = get_citation_dict("selfcitdict") result = [] if cache_selfcit_dictionary and cache_selfcit_dictionary.has_key(record_id): result.extend(cache_selfcit_dictionary[record_id]) if not result: return None return result def get_self_cited_in(record_id): """Return a list of doc ids [y1,y2,..] for the rec id x given as param, so that x is cited in y1,y2,.. and x and each y share an author """ cache_selfcitedby_dictionary = get_citation_dict("selfcitedbydict") result = [] if cache_selfcitedby_dictionary and cache_selfcitedby_dictionary.has_key(record_id): result.extend(cache_selfcitedby_dictionary[record_id]) if not result: return None return result def calculate_co_cited_with_list(record_id, sort_order="d"): """Return a tuple of ([recid,co-cited weight],...) for records that are co-cited with RECORD_ID. The resulting recids is sorted by ascending/descending citation weights depending or SORT_ORDER. """ cache_cited_by_dictionary = get_citation_dict("citationdict") cache_reference_list_dictionary = get_citation_dict("reversedict") result = [] result_intermediate = {} citation_list = [] if cache_cited_by_dictionary: citation_list = cache_cited_by_dictionary.get(record_id, []) for cit_id in citation_list: reference_list = [] if cache_reference_list_dictionary: reference_list = cache_reference_list_dictionary.get(cit_id, []) for ref_id in reference_list: if not result_intermediate.has_key(ref_id): result_intermediate[ref_id] = 1 else: result_intermediate[ref_id] += 1 for key, value in result_intermediate.iteritems(): if not (key==record_id): result.append([key, value]) if result: if sort_order == "d": result.sort(lambda x, y: cmp(y[1], x[1])) else: result.sort(lambda x, y: cmp(x[1], y[1])) return result diff --git a/modules/miscutil/sql/tabfill.sql b/modules/miscutil/sql/tabfill.sql index 7731608a3..d4a50bffc 100644 --- a/modules/miscutil/sql/tabfill.sql +++ b/modules/miscutil/sql/tabfill.sql @@ -1,597 +1,599 @@ -- $Id$ -- This file is part of CDS Invenio. -- Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. -- -- CDS Invenio is free software; you can redistribute it and/or -- modify it under the terms of the GNU General Public License as -- published by the Free Software Foundation; either version 2 of the -- License, or (at your option) any later version. -- -- CDS Invenio is distributed in the hope that it will be useful, but -- WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- General Public License for more details. 
-- -- You should have received a copy of the GNU General Public License -- along with CDS Invenio; if not, write to the Free Software Foundation, Inc., -- 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -- Fill Invenio configuration tables with defaults suitable for any site. INSERT INTO rnkMETHOD (id,name,last_updated) VALUES (1,'wrd','0000-00-00 00:00:00'); INSERT INTO collection_rnkMETHOD (id_collection,id_rnkMETHOD,score) VALUES (1,1,100); INSERT INTO rnkCITATIONDATA VALUES (1,'citationdict',NULL,'0000-00-00'); INSERT INTO rnkCITATIONDATA VALUES (2,'reversedict',NULL,'0000-00-00'); INSERT INTO rnkCITATIONDATA VALUES (3,'selfcitdict',NULL,'0000-00-00'); INSERT INTO rnkCITATIONDATA VALUES (4,'selfcitedbydict',NULL,'0000-00-00'); INSERT INTO field VALUES (1,'any field','anyfield'); INSERT INTO field VALUES (2,'title','title'); INSERT INTO field VALUES (3,'author','author'); INSERT INTO field VALUES (4,'abstract','abstract'); INSERT INTO field VALUES (5,'keyword','keyword'); INSERT INTO field VALUES (6,'report number','reportnumber'); INSERT INTO field VALUES (7,'subject','subject'); INSERT INTO field VALUES (8,'reference','reference'); INSERT INTO field VALUES (9,'fulltext','fulltext'); INSERT INTO field VALUES (10,'collection','collection'); INSERT INTO field VALUES (11,'division','division'); INSERT INTO field VALUES (12,'year','year'); INSERT INTO field VALUES (13,'experiment','experiment'); INSERT INTO field VALUES (14,'record ID','recid'); INSERT INTO field VALUES (15,'isbn','isbn'); INSERT INTO field VALUES (16,'issn','issn'); INSERT INTO field VALUES (17,'coden','coden'); -- INSERT INTO field VALUES (18,'doi','doi'); INSERT INTO field VALUES (19,'journal','journal'); INSERT INTO field VALUES (20,'collaboration','collaboration'); INSERT INTO field VALUES (21,'affiliation','affiliation'); INSERT INTO field VALUES (22,'exact author','exactauthor'); INSERT INTO field VALUES (23,'date created','datecreated'); INSERT INTO field VALUES (24,'date modified','datemodified'); +INSERT INTO field VALUES (25,'refers to','refersto'); +INSERT INTO field VALUES (26,'cited by','citedby'); INSERT INTO field_tag VALUES (1,100,10); INSERT INTO field_tag VALUES (1,102,10); INSERT INTO field_tag VALUES (1,103,10); INSERT INTO field_tag VALUES (1,104,10); INSERT INTO field_tag VALUES (1,105,10); INSERT INTO field_tag VALUES (1,106,10); INSERT INTO field_tag VALUES (1,107,10); INSERT INTO field_tag VALUES (1,108,10); INSERT INTO field_tag VALUES (1,109,10); INSERT INTO field_tag VALUES (1,110,10); INSERT INTO field_tag VALUES (1,111,10); INSERT INTO field_tag VALUES (1,112,10); INSERT INTO field_tag VALUES (1,113,10); INSERT INTO field_tag VALUES (1,114,10); INSERT INTO field_tag VALUES (1,16,10); INSERT INTO field_tag VALUES (1,17,10); INSERT INTO field_tag VALUES (1,18,10); INSERT INTO field_tag VALUES (1,19,10); INSERT INTO field_tag VALUES (1,20,10); INSERT INTO field_tag VALUES (1,21,10); INSERT INTO field_tag VALUES (1,22,10); INSERT INTO field_tag VALUES (1,23,10); INSERT INTO field_tag VALUES (1,24,10); INSERT INTO field_tag VALUES (1,25,10); INSERT INTO field_tag VALUES (1,26,10); INSERT INTO field_tag VALUES (1,27,10); INSERT INTO field_tag VALUES (1,28,10); INSERT INTO field_tag VALUES (1,29,10); INSERT INTO field_tag VALUES (1,30,10); INSERT INTO field_tag VALUES (1,31,10); INSERT INTO field_tag VALUES (1,32,10); INSERT INTO field_tag VALUES (1,33,10); INSERT INTO field_tag VALUES (1,34,10); INSERT INTO field_tag VALUES (1,35,10); INSERT INTO field_tag VALUES (1,36,10); INSERT INTO 
field_tag VALUES (1,37,10); INSERT INTO field_tag VALUES (1,38,10); INSERT INTO field_tag VALUES (1,39,10); INSERT INTO field_tag VALUES (1,40,10); INSERT INTO field_tag VALUES (1,41,10); INSERT INTO field_tag VALUES (1,42,10); INSERT INTO field_tag VALUES (1,43,10); INSERT INTO field_tag VALUES (1,44,10); INSERT INTO field_tag VALUES (1,45,10); INSERT INTO field_tag VALUES (1,46,10); INSERT INTO field_tag VALUES (1,47,10); INSERT INTO field_tag VALUES (1,48,10); INSERT INTO field_tag VALUES (1,49,10); INSERT INTO field_tag VALUES (1,50,10); INSERT INTO field_tag VALUES (1,51,10); INSERT INTO field_tag VALUES (1,52,10); INSERT INTO field_tag VALUES (1,53,10); INSERT INTO field_tag VALUES (1,54,10); INSERT INTO field_tag VALUES (1,55,10); INSERT INTO field_tag VALUES (1,56,10); INSERT INTO field_tag VALUES (1,57,10); INSERT INTO field_tag VALUES (1,58,10); INSERT INTO field_tag VALUES (1,59,10); INSERT INTO field_tag VALUES (1,60,10); INSERT INTO field_tag VALUES (1,61,10); INSERT INTO field_tag VALUES (1,62,10); INSERT INTO field_tag VALUES (1,63,10); INSERT INTO field_tag VALUES (1,64,10); INSERT INTO field_tag VALUES (1,65,10); INSERT INTO field_tag VALUES (1,66,10); INSERT INTO field_tag VALUES (1,67,10); INSERT INTO field_tag VALUES (1,68,10); INSERT INTO field_tag VALUES (1,69,10); INSERT INTO field_tag VALUES (1,70,10); INSERT INTO field_tag VALUES (1,71,10); INSERT INTO field_tag VALUES (1,72,10); INSERT INTO field_tag VALUES (1,73,10); INSERT INTO field_tag VALUES (1,74,10); INSERT INTO field_tag VALUES (1,75,10); INSERT INTO field_tag VALUES (1,76,10); INSERT INTO field_tag VALUES (1,77,10); INSERT INTO field_tag VALUES (1,78,10); INSERT INTO field_tag VALUES (1,79,10); INSERT INTO field_tag VALUES (1,80,10); INSERT INTO field_tag VALUES (1,81,10); INSERT INTO field_tag VALUES (1,82,10); INSERT INTO field_tag VALUES (1,83,10); INSERT INTO field_tag VALUES (1,84,10); INSERT INTO field_tag VALUES (1,85,10); INSERT INTO field_tag VALUES (1,86,10); INSERT INTO field_tag VALUES (1,87,10); INSERT INTO field_tag VALUES (1,88,10); INSERT INTO field_tag VALUES (1,89,10); INSERT INTO field_tag VALUES (1,90,10); INSERT INTO field_tag VALUES (1,91,10); INSERT INTO field_tag VALUES (1,92,10); INSERT INTO field_tag VALUES (1,93,10); INSERT INTO field_tag VALUES (1,94,10); INSERT INTO field_tag VALUES (1,95,10); INSERT INTO field_tag VALUES (1,96,10); INSERT INTO field_tag VALUES (1,97,10); INSERT INTO field_tag VALUES (1,98,10); INSERT INTO field_tag VALUES (1,99,10); INSERT INTO field_tag VALUES (1,122,10); INSERT INTO field_tag VALUES (1,123,10); INSERT INTO field_tag VALUES (1,124,10); INSERT INTO field_tag VALUES (1,125,10); INSERT INTO field_tag VALUES (1,126,10); INSERT INTO field_tag VALUES (1,127,10); INSERT INTO field_tag VALUES (1,128,10); INSERT INTO field_tag VALUES (1,129,10); INSERT INTO field_tag VALUES (1,130,10); INSERT INTO field_tag VALUES (10,11,100); INSERT INTO field_tag VALUES (11,14,100); INSERT INTO field_tag VALUES (12,15,10); INSERT INTO field_tag VALUES (13,116,10); INSERT INTO field_tag VALUES (2,3,100); INSERT INTO field_tag VALUES (2,4,90); INSERT INTO field_tag VALUES (3,1,100); INSERT INTO field_tag VALUES (3,2,90); INSERT INTO field_tag VALUES (4,5,100); INSERT INTO field_tag VALUES (5,6,100); INSERT INTO field_tag VALUES (6,7,30); INSERT INTO field_tag VALUES (6,8,10); INSERT INTO field_tag VALUES (6,9,20); INSERT INTO field_tag VALUES (7,12,100); INSERT INTO field_tag VALUES (7,13,90); INSERT INTO field_tag VALUES (8,10,100); INSERT INTO field_tag VALUES 
(9,115,100); INSERT INTO field_tag VALUES (14,117,100); INSERT INTO field_tag VALUES (15,118,100); INSERT INTO field_tag VALUES (16,119,100); INSERT INTO field_tag VALUES (17,120,100); -- INSERT INTO field_tag VALUES (18,121,100); INSERT INTO field_tag VALUES (19,131,100); INSERT INTO field_tag VALUES (20,132,100); INSERT INTO field_tag VALUES (21,133,100); INSERT INTO field_tag VALUES (21,134,90); INSERT INTO field_tag VALUES (22,1,100); INSERT INTO field_tag VALUES (22,2,90); INSERT INTO format VALUES (1,'HTML brief','hb', 'HTML brief output format, used for search results pages.', 'text/html', 1); INSERT INTO format VALUES (2,'HTML detailed','hd', 'HTML detailed output format, used for Detailed record pages.', 'text/html', 1); INSERT INTO format VALUES (3,'MARC','hm', 'HTML MARC.', 'text/html', 1); INSERT INTO format VALUES (4,'Dublin Core','xd', 'XML Dublin Core.', 'text/xml', 1); INSERT INTO format VALUES (5,'MARCXML','xm', 'XML MARC.', 'text/xml', 1); INSERT INTO format VALUES (6,'portfolio','hp', 'HTML portfolio-style output format for photos.', 'text/html', 1); INSERT INTO format VALUES (7,'photo captions only','hc', 'HTML caption-only output format for photos.', 'text/html', 1); INSERT INTO format VALUES (8,'BibTeX','hx', 'BibTeX.', 'text/html', 1); INSERT INTO format VALUES (9,'EndNote','xe', 'XML EndNote.', 'text/xml', 1); INSERT INTO format VALUES (10,'NLM','xn', 'XML NLM.', 'text/xml', 1); INSERT INTO format VALUES (11,'Excel','excel', 'Excel csv output', 'application/ms-excel', 0); INSERT INTO format VALUES (12,'HTML similarity','hs', 'Very short HTML output for similarity box (people also viewed..).', 'text/html', 0); INSERT INTO format VALUES (13,'RSS','xr', 'RSS.', 'text/xml', 0); INSERT INTO format VALUES (14,'OAI DC','xoaidc', 'OAI DC.', 'text/xml', 0); INSERT INTO format VALUES (15,'File mini-panel', 'hdfile', 'Used to show fulltext files in mini-panel of detailed record pages.', 'text/html', 0); INSERT INTO format VALUES (16,'Actions mini-panel', 'hdact', 'Used to display actions in mini-panel of detailed record pages.', 'text/html', 0); INSERT INTO format VALUES (17,'References tab', 'hdref', 'Display record references in References tab.', 'text/html', 0); INSERT INTO format VALUES (18,'HTML citesummary','hcs', 'HTML cite summary format, used for search results pages.', 'text/html', 1); INSERT INTO format VALUES (19,'RefWorks','xw', 'RefWorks.', 'text/xml', 1); INSERT INTO format VALUES (20,'MODS', 'xo', 'Metadata Object Description Schema', 'application/xml', 1); INSERT INTO tag VALUES (1,'first author name','100__a'); INSERT INTO tag VALUES (2,'additional author name','700__a'); INSERT INTO tag VALUES (3,'main title','245__%'); INSERT INTO tag VALUES (4,'additional title','246__%'); INSERT INTO tag VALUES (5,'abstract','520__%'); INSERT INTO tag VALUES (6,'keyword','6531_a'); INSERT INTO tag VALUES (7,'primary report number','037__a'); INSERT INTO tag VALUES (8,'additional report number','088__a'); INSERT INTO tag VALUES (9,'added report number','909C0r'); INSERT INTO tag VALUES (10,'reference','999C5%'); INSERT INTO tag VALUES (11,'collection identifier','980__%'); INSERT INTO tag VALUES (12,'main subject','65017a'); INSERT INTO tag VALUES (13,'additional subject','65027a'); INSERT INTO tag VALUES (14,'division','909C0p'); INSERT INTO tag VALUES (15,'year','909C0y'); INSERT INTO tag VALUES (16,'00x','00%'); INSERT INTO tag VALUES (17,'01x','01%'); INSERT INTO tag VALUES (18,'02x','02%'); INSERT INTO tag VALUES (19,'03x','03%'); INSERT INTO tag VALUES 
(20,'lang','04%'); INSERT INTO tag VALUES (21,'05x','05%'); INSERT INTO tag VALUES (22,'06x','06%'); INSERT INTO tag VALUES (23,'07x','07%'); INSERT INTO tag VALUES (24,'08x','08%'); INSERT INTO tag VALUES (25,'09x','09%'); INSERT INTO tag VALUES (26,'10x','10%'); INSERT INTO tag VALUES (27,'11x','11%'); INSERT INTO tag VALUES (28,'12x','12%'); INSERT INTO tag VALUES (29,'13x','13%'); INSERT INTO tag VALUES (30,'14x','14%'); INSERT INTO tag VALUES (31,'15x','15%'); INSERT INTO tag VALUES (32,'16x','16%'); INSERT INTO tag VALUES (33,'17x','17%'); INSERT INTO tag VALUES (34,'18x','18%'); INSERT INTO tag VALUES (35,'19x','19%'); INSERT INTO tag VALUES (36,'20x','20%'); INSERT INTO tag VALUES (37,'21x','21%'); INSERT INTO tag VALUES (38,'22x','22%'); INSERT INTO tag VALUES (39,'23x','23%'); INSERT INTO tag VALUES (40,'24x','24%'); INSERT INTO tag VALUES (41,'25x','25%'); INSERT INTO tag VALUES (42,'internal','26%'); INSERT INTO tag VALUES (43,'27x','27%'); INSERT INTO tag VALUES (44,'28x','28%'); INSERT INTO tag VALUES (45,'29x','29%'); INSERT INTO tag VALUES (46,'pages','30%'); INSERT INTO tag VALUES (47,'31x','31%'); INSERT INTO tag VALUES (48,'32x','32%'); INSERT INTO tag VALUES (49,'33x','33%'); INSERT INTO tag VALUES (50,'34x','34%'); INSERT INTO tag VALUES (51,'35x','35%'); INSERT INTO tag VALUES (52,'36x','36%'); INSERT INTO tag VALUES (53,'37x','37%'); INSERT INTO tag VALUES (54,'38x','38%'); INSERT INTO tag VALUES (55,'39x','39%'); INSERT INTO tag VALUES (56,'40x','40%'); INSERT INTO tag VALUES (57,'41x','41%'); INSERT INTO tag VALUES (58,'42x','42%'); INSERT INTO tag VALUES (59,'43x','43%'); INSERT INTO tag VALUES (60,'44x','44%'); INSERT INTO tag VALUES (61,'45x','45%'); INSERT INTO tag VALUES (62,'46x','46%'); INSERT INTO tag VALUES (63,'47x','47%'); INSERT INTO tag VALUES (64,'48x','48%'); INSERT INTO tag VALUES (65,'series','49%'); INSERT INTO tag VALUES (66,'50x','50%'); INSERT INTO tag VALUES (67,'51x','51%'); INSERT INTO tag VALUES (68,'52x','52%'); INSERT INTO tag VALUES (69,'53x','53%'); INSERT INTO tag VALUES (70,'54x','54%'); INSERT INTO tag VALUES (71,'55x','55%'); INSERT INTO tag VALUES (72,'56x','56%'); INSERT INTO tag VALUES (73,'57x','57%'); INSERT INTO tag VALUES (74,'58x','58%'); INSERT INTO tag VALUES (75,'summary','59%'); INSERT INTO tag VALUES (76,'60x','60%'); INSERT INTO tag VALUES (77,'61x','61%'); INSERT INTO tag VALUES (78,'62x','62%'); INSERT INTO tag VALUES (79,'63x','63%'); INSERT INTO tag VALUES (80,'64x','64%'); INSERT INTO tag VALUES (81,'65x','65%'); INSERT INTO tag VALUES (82,'66x','66%'); INSERT INTO tag VALUES (83,'67x','67%'); INSERT INTO tag VALUES (84,'68x','68%'); INSERT INTO tag VALUES (85,'subject','69%'); INSERT INTO tag VALUES (86,'70x','70%'); INSERT INTO tag VALUES (87,'71x','71%'); INSERT INTO tag VALUES (88,'author-ad','72%'); INSERT INTO tag VALUES (89,'73x','73%'); INSERT INTO tag VALUES (90,'74x','74%'); INSERT INTO tag VALUES (91,'75x','75%'); INSERT INTO tag VALUES (92,'76x','76%'); INSERT INTO tag VALUES (93,'77x','77%'); INSERT INTO tag VALUES (94,'78x','78%'); INSERT INTO tag VALUES (95,'79x','79%'); INSERT INTO tag VALUES (96,'80x','80%'); INSERT INTO tag VALUES (97,'81x','81%'); INSERT INTO tag VALUES (98,'82x','82%'); INSERT INTO tag VALUES (99,'83x','83%'); INSERT INTO tag VALUES (100,'84x','84%'); INSERT INTO tag VALUES (101,'electr','85%'); INSERT INTO tag VALUES (102,'86x','86%'); INSERT INTO tag VALUES (103,'87x','87%'); INSERT INTO tag VALUES (104,'88x','88%'); INSERT INTO tag VALUES (105,'89x','89%'); INSERT INTO tag 
VALUES (106,'publication','90%'); INSERT INTO tag VALUES (107,'pub-conf-cit','91%'); INSERT INTO tag VALUES (108,'92x','92%'); INSERT INTO tag VALUES (109,'93x','93%'); INSERT INTO tag VALUES (110,'94x','94%'); INSERT INTO tag VALUES (111,'95x','95%'); INSERT INTO tag VALUES (112,'catinfo','96%'); INSERT INTO tag VALUES (113,'97x','97%'); INSERT INTO tag VALUES (114,'98x','98%'); INSERT INTO tag VALUES (115,'url','8564_u'); INSERT INTO tag VALUES (116,'experiment','909C0e'); INSERT INTO tag VALUES (117,'record ID','001'); INSERT INTO tag VALUES (118,'isbn','020__a'); INSERT INTO tag VALUES (119,'issn','022__a'); INSERT INTO tag VALUES (120,'coden','030__a'); -- INSERT INTO tag VALUES (121,'doi','773__a'); INSERT INTO tag VALUES (122,'850x','850%'); INSERT INTO tag VALUES (123,'851x','851%'); INSERT INTO tag VALUES (124,'852x','852%'); INSERT INTO tag VALUES (125,'853x','853%'); INSERT INTO tag VALUES (126,'854x','854%'); INSERT INTO tag VALUES (127,'855x','855%'); INSERT INTO tag VALUES (128,'857x','857%'); INSERT INTO tag VALUES (129,'858x','858%'); INSERT INTO tag VALUES (130,'859x','859%'); INSERT INTO tag VALUES (131,'journal','909C4%'); INSERT INTO tag VALUES (132,'collaboration','710__g'); INSERT INTO tag VALUES (133,'first author affiliation','100__u'); INSERT INTO tag VALUES (134,'additional author affiliation','700__u'); INSERT INTO idxINDEX VALUES (1,'global','This index contains words/phrases from global fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (2,'collection','This index contains words/phrases from collection identifiers fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (3,'abstract','This index contains words/phrases from abstract fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (4,'author','This index contains fuzzy words/phrases from author fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (5,'keyword','This index contains words/phrases from keyword fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (6,'reference','This index contains words/phrases from references fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (7,'reportnumber','This index contains words/phrases from report numbers fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (8,'title','This index contains words/phrases from title fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (9,'fulltext','This index contains words/phrases from fulltext fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (10,'year','This index contains words/phrases from year fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (11,'journal','This index contains words/phrases from journal publication information fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (12,'collaboration','This index contains words/phrases from collaboration name fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (13,'affiliation','This index contains words/phrases from institutional affiliation fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX VALUES (14,'exactauthor','This index contains exact words/phrases from author fields.','0000-00-00 00:00:00', ''); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (1,1); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (2,10); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (3,4); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (4,3); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (5,5); 
INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (6,8); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (7,6); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (8,2); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (9,9); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (10,12); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (11,19); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (12,20); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (13,21); INSERT INTO idxINDEX_field (id_idxINDEX, id_field) VALUES (14,22); INSERT INTO sbmACTION VALUES ('Submit New Record','SBI','running','1998-08-17','2001-08-08','','Submit New Record'); INSERT INTO sbmACTION VALUES ('Modify Record','MBI','modify','1998-08-17','2001-11-07','','Modify Record'); INSERT INTO sbmACTION VALUES ('Submit New File','SRV','revise','0000-00-00','2001-11-07','','Submit New File'); INSERT INTO sbmACTION VALUES ('Approve Record','APP','approve','2001-11-08','2002-06-11','','Approve Record'); INSERT INTO sbmALLFUNCDESCR VALUES ('Ask_For_Record_Details_Confirmation',''); INSERT INTO sbmALLFUNCDESCR VALUES ('CaseEDS',''); INSERT INTO sbmALLFUNCDESCR VALUES ('Create_Modify_Interface',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Create_Recid',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Finish_Submission',''); INSERT INTO sbmALLFUNCDESCR VALUES ('Get_Info',''); INSERT INTO sbmALLFUNCDESCR VALUES ('Get_Recid', 'This function gets the recid for a document with a given report-number (as stored in the global variable rn).'); INSERT INTO sbmALLFUNCDESCR VALUES ('Get_Report_Number',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Get_Sysno',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Insert_Modify_Record',''); INSERT INTO sbmALLFUNCDESCR VALUES ('Insert_Record',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Is_Original_Submitter',''); INSERT INTO sbmALLFUNCDESCR VALUES ('Is_Referee','This function checks whether the logged user is a referee for the current document'); INSERT INTO sbmALLFUNCDESCR VALUES ('Mail_Approval_Request_to_Referee',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Mail_Approval_Withdrawn_to_Referee',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Mail_Submitter',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Make_Modify_Record',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Make_Record',''); INSERT INTO sbmALLFUNCDESCR VALUES ('Move_From_Pending',''); INSERT INTO sbmALLFUNCDESCR VALUES ('Move_to_Done',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Move_to_Pending',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Print_Success',''); INSERT INTO sbmALLFUNCDESCR VALUES ('Print_Success_Approval_Request',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Print_Success_APP',''); INSERT INTO sbmALLFUNCDESCR VALUES ('Print_Success_DEL','Prepare a message for the user informing them that their record was successfully deleted.'); INSERT INTO sbmALLFUNCDESCR VALUES ('Print_Success_MBI',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Print_Success_SRV',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Register_Approval_Request',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Register_Referee_Decision',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Withdraw_Approval_Request',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Report_Number_Generation',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Second_Report_Number_Generation','Generate a secondary report number for a document.'); INSERT INTO sbmALLFUNCDESCR VALUES ('Send_Approval_Request',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Send_APP_Mail',''); INSERT INTO 
sbmALLFUNCDESCR VALUES ('Send_Delete_Mail',''); INSERT INTO sbmALLFUNCDESCR VALUES ('Send_Modify_Mail',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Send_SRV_Mail',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('Stamp_Replace_Single_File_Approval','Stamp a single file when a document is approved.'); INSERT INTO sbmALLFUNCDESCR VALUES ('Stamp_Uploaded_Files','Stamp some of the files that were uploaded during a submission.'); INSERT INTO sbmALLFUNCDESCR VALUES ('Test_Status',''); INSERT INTO sbmALLFUNCDESCR VALUES ('Update_Approval_DB',NULL); INSERT INTO sbmALLFUNCDESCR VALUES ('User_is_Record_Owner_or_Curator','Check if user is owner or special editor of a record'); INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Files_to_Storage','Attach files received from chosen file input element(s)'); INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Revised_Files_to_Storage','Revise files initially uploaded with "Move_Files_to_Storage"'); INSERT INTO sbmALLFUNCDESCR VALUES ('Make_Dummy_MARC_XML_Record',''); INSERT INTO sbmALLFUNCDESCR VALUES ('Move_FCKeditor_Files_to_Storage','Transfer files attached to the record with the FCKeditor'); INSERT INTO sbmALLFUNCDESCR VALUES ('Create_Upload_Files_Interface','Display generic interface to add/revise/delete files. To be used before function "Move_Uploaded_Files_to_Storage"'); INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Uploaded_Files_to_Storage','Attach files uploaded with "Create_Upload_Files_Interface"'); INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Photos_to_Storage','Attach/edit the pictures uploaded with the "create_photos_manager_interface()" function'); INSERT INTO sbmFIELDDESC VALUES ('Upload_Photos',NULL,'','R',NULL,NULL,NULL,NULL,NULL,'\"\"\"\r\nThis is an example of element that creates a photos upload interface.\r\nClone it, customize it and integrate it into your submission. Then add function \r\n\'Move_Photos_to_Storage\' to your submission functions list, in order for files \r\nuploaded with this interface to be attached to the record. 
More information in \r\nthe WebSubmit admin guide.\r\n\"\"\"\r\n\r\nfrom invenio.websubmit_functions.ParamFile import ParamFromFile\r\nfrom invenio.websubmit_functions.Move_Photos_to_Storage import \\\r\n read_param_file, \\\r\n create_photos_manager_interface, \\\r\n get_session_id\r\n\r\n# Retrieve session id\r\ntry:\r\n # User info is defined only in MBI/MPI actions...\r\n session_id = get_session_id(None, uid, user_info) \r\nexcept:\r\n session_id = get_session_id(req, uid, {})\r\n\r\n# Retrieve context\r\nindir = curdir.split(\'/\')[-3]\r\ndoctype = curdir.split(\'/\')[-2]\r\naccess = curdir.split(\'/\')[-1]\r\n\r\n# Get the record ID, if any\r\nsysno = ParamFromFile(\"%s/%s\" % (curdir,\'SN\')).strip()\r\n\r\n\"\"\"\r\nModify below the configuration of the photos manager interface.\r\nNote: `can_reorder_photos\' parameter is not yet fully taken into consideration\r\n\r\nDocumentation of the function is available at \r\n\"\"\"\r\ntext += create_photos_manager_interface(sysno, session_id, uid,\r\n doctype, indir, curdir, access,\r\n can_delete_photos=True,\r\n can_reorder_photos=True,\r\n can_upload_photos=True,\r\n editor_width=700,\r\n editor_height=400,\r\n initial_slider_value=100,\r\n max_slider_value=200,\r\n min_slider_value=80)','0000-00-00','0000-00-00',NULL,NULL,0); INSERT INTO sbmCHECKS VALUES ('AUCheck','function AUCheck(txt) {\r\n var res=1;\r\n tmp=txt.indexOf(\"\\015\");\r\n while (tmp != -1) {\r\n left=txt.substring(0,tmp);\r\n right=txt.substring(tmp+2,txt.length);\r\n txt=left + \"\\012\" + right;\r\n tmp=txt.indexOf(\"\\015\");\r\n }\r\n tmp=txt.indexOf(\"\\012\");\r\n if (tmp==-1){\r\n line=txt;\r\n txt=\'\';}\r\n else{\r\n line=txt.substring(0,tmp);\r\n txt=txt.substring(tmp+1,txt.length);}\r\n while (line != \"\"){\r\n coma=line.indexOf(\",\");\r\n left=line.substring(0,coma);\r\n right=line.substring(coma+1,line.length);\r\n coma2=right.indexOf(\",\");\r\n space=right.indexOf(\" \");\r\n if ((coma==-1)||(left==\"\")||(right==\"\")||(space!=0)||(coma2!=-1)){\r\n res=0;\r\n error_log=line;\r\n }\r\n tmp=txt.indexOf(\"\\012\");\r\n if (tmp==-1){\r\n line=txt;\r\n txt=\'\';}\r\n else{\r\n line=txt.substring(0,tmp-1);\r\n txt=txt.substring(tmp+1,txt.length);}\r\n }\r\n if (res == 0){\r\n alert(\"This author name cannot be managed \\: \\012\\012\" + error_log + \" \\012\\012It is not in the required format!\\012Put one author per line and a comma (,) between the name and the firstname initial letters. 
\\012The name is going first, followed by the firstname initial letters.\\012Do not forget the whitespace after the comma!!!\\012\\012Example \\: Put\\012\\012Le Meur, J Y \\012Baron, T \\012\\012for\\012\\012Le Meur Jean-Yves & Baron Thomas.\");\r\n return 0;\r\n } \r\n return 1; \r\n}','1998-08-18','0000-00-00','',''); INSERT INTO sbmCHECKS VALUES ('DatCheckNew','function DatCheckNew(txt) {\r\n var res=1;\r\n if (txt.length != 10){res=0;}\r\n if (txt.indexOf(\"/\") != 2){res=0;}\r\n if (txt.lastIndexOf(\"/\") != 5){res=0;}\r\n tmp=parseInt(txt.substring(0,2),10);\r\n if ((tmp > 31)||(tmp < 1)||(isNaN(tmp))){res=0;}\r\n tmp=parseInt(txt.substring(3,5),10);\r\n if ((tmp > 12)||(tmp < 1)||(isNaN(tmp))){res=0;}\r\n tmp=parseInt(txt.substring(6,10),10);\r\n if ((tmp < 1)||(isNaN(tmp))){res=0;}\r\n if (txt.length == 0){res=1;}\r\n if (res == 0){\r\n alert(\"Please enter a correct Date \\012Format: dd/mm/yyyy\");\r\n return 0;\r\n }\r\n return 1; \r\n}','0000-00-00','0000-00-00','',''); INSERT INTO sbmFIELDDESC VALUES ('Upload_Files',NULL,'','R',NULL,NULL,NULL,NULL,NULL,'\"\"\"\r\nThis is an example of element that creates a file upload interface.\r\nClone it, customize it and integrate it into your submission. Then add function \r\n\'Move_Uploaded_Files_to_Storage\' to your submission functions list, in order for files \r\nuploaded with this interface to be attached to the record. More information in \r\nthe WebSubmit admin guide.\r\n\"\"\"\r\nimport os\r\nfrom invenio.websubmit_managedocfiles import create_file_upload_interface\r\nfrom invenio.websubmit_functions.Shared_Functions import ParamFromFile\r\n\r\nindir = ParamFromFile(os.path.join(curdir, \'indir\'))\r\ndoctype = ParamFromFile(os.path.join(curdir, \'doctype\'))\r\naccess = ParamFromFile(os.path.join(curdir, \'access\'))\r\ntry:\r\n sysno = int(ParamFromFile(os.path.join(curdir, \'SN\')).strip())\r\nexcept:\r\n sysno = -1\r\nln = ParamFromFile(os.path.join(curdir, \'ln\'))\r\n\r\n\"\"\"\r\nRun the following to get the list of parameters of function \'create_file_upload_interface\':\r\necho -e \'from invenio.websubmit_managedocfiles import create_file_upload_interface as f\\nprint f.__doc__\' | python\r\n\"\"\"\r\ntext = create_file_upload_interface(recid=sysno,\r\n print_outside_form_tag=False,\r\n include_headers=True,\r\n ln=ln,\r\n doctypes_and_desc=[(\'main\',\'Main document\'),\r\n (\'additional\',\'Figure, schema, etc.\')],\r\n can_revise_doctypes=[\'*\'],\r\n can_describe_doctypes=[\'main\'],\r\n can_delete_doctypes=[\'additional\'],\r\n can_rename_doctypes=[\'main\'],\r\n sbm_indir=indir, sbm_doctype=doctype, sbm_access=access)[1]\r\n','0000-00-00','0000-00-00',NULL,NULL,0); INSERT INTO sbmFORMATEXTENSION VALUES ('WORD','.doc'); INSERT INTO sbmFORMATEXTENSION VALUES ('PostScript','.ps'); INSERT INTO sbmFORMATEXTENSION VALUES ('PDF','.pdf'); INSERT INTO sbmFORMATEXTENSION VALUES ('JPEG','.jpg'); INSERT INTO sbmFORMATEXTENSION VALUES ('JPEG','.jpeg'); INSERT INTO sbmFORMATEXTENSION VALUES ('GIF','.gif'); INSERT INTO sbmFORMATEXTENSION VALUES ('PPT','.ppt'); INSERT INTO sbmFORMATEXTENSION VALUES ('HTML','.htm'); INSERT INTO sbmFORMATEXTENSION VALUES ('HTML','.html'); INSERT INTO sbmFORMATEXTENSION VALUES ('Latex','.tex'); INSERT INTO sbmFORMATEXTENSION VALUES ('Compressed PostScript','.ps.gz'); INSERT INTO sbmFORMATEXTENSION VALUES ('Tarred Tex (.tar)','.tar'); INSERT INTO sbmFORMATEXTENSION VALUES ('Text','.txt'); INSERT INTO sbmFUNDESC VALUES ('Get_Report_Number','edsrn'); INSERT INTO sbmFUNDESC VALUES 
('Send_Modify_Mail','addressesMBI'); INSERT INTO sbmFUNDESC VALUES ('Send_Modify_Mail','sourceDoc'); INSERT INTO sbmFUNDESC VALUES ('Register_Approval_Request','categ_file_appreq'); INSERT INTO sbmFUNDESC VALUES ('Register_Approval_Request','categ_rnseek_appreq'); INSERT INTO sbmFUNDESC VALUES ('Register_Approval_Request','note_file_appreq'); INSERT INTO sbmFUNDESC VALUES ('Register_Referee_Decision','decision_file'); INSERT INTO sbmFUNDESC VALUES ('Withdraw_Approval_Request','categ_file_withd'); INSERT INTO sbmFUNDESC VALUES ('Withdraw_Approval_Request','categ_rnseek_withd'); INSERT INTO sbmFUNDESC VALUES ('Report_Number_Generation','edsrn'); INSERT INTO sbmFUNDESC VALUES ('Report_Number_Generation','autorngen'); INSERT INTO sbmFUNDESC VALUES ('Report_Number_Generation','rnin'); INSERT INTO sbmFUNDESC VALUES ('Report_Number_Generation','counterpath'); INSERT INTO sbmFUNDESC VALUES ('Report_Number_Generation','rnformat'); INSERT INTO sbmFUNDESC VALUES ('Report_Number_Generation','yeargen'); INSERT INTO sbmFUNDESC VALUES ('Report_Number_Generation','nblength'); INSERT INTO sbmFUNDESC VALUES ('Mail_Approval_Request_to_Referee','categ_file_appreq'); INSERT INTO sbmFUNDESC VALUES ('Mail_Approval_Request_to_Referee','categ_rnseek_appreq'); INSERT INTO sbmFUNDESC VALUES ('Mail_Approval_Request_to_Referee','edsrn'); INSERT INTO sbmFUNDESC VALUES ('Mail_Approval_Withdrawn_to_Referee','categ_file_withd'); INSERT INTO sbmFUNDESC VALUES ('Mail_Approval_Withdrawn_to_Referee','categ_rnseek_withd'); INSERT INTO sbmFUNDESC VALUES ('Mail_Submitter','authorfile'); INSERT INTO sbmFUNDESC VALUES ('Mail_Submitter','status'); INSERT INTO sbmFUNDESC VALUES ('Send_Approval_Request','authorfile'); INSERT INTO sbmFUNDESC VALUES ('Create_Modify_Interface','fieldnameMBI'); INSERT INTO sbmFUNDESC VALUES ('Send_Modify_Mail','fieldnameMBI'); INSERT INTO sbmFUNDESC VALUES ('Update_Approval_DB','categformatDAM'); INSERT INTO sbmFUNDESC VALUES ('Update_Approval_DB','decision_file'); INSERT INTO sbmFUNDESC VALUES ('Send_SRV_Mail','categformatDAM'); INSERT INTO sbmFUNDESC VALUES ('Send_SRV_Mail','addressesSRV'); INSERT INTO sbmFUNDESC VALUES ('Send_Approval_Request','directory'); INSERT INTO sbmFUNDESC VALUES ('Send_Approval_Request','categformatDAM'); INSERT INTO sbmFUNDESC VALUES ('Send_Approval_Request','addressesDAM'); INSERT INTO sbmFUNDESC VALUES ('Send_Approval_Request','titleFile'); INSERT INTO sbmFUNDESC VALUES ('Send_APP_Mail','edsrn'); INSERT INTO sbmFUNDESC VALUES ('Mail_Submitter','titleFile'); INSERT INTO sbmFUNDESC VALUES ('Send_Modify_Mail','emailFile'); INSERT INTO sbmFUNDESC VALUES ('Get_Info','authorFile'); INSERT INTO sbmFUNDESC VALUES ('Get_Info','emailFile'); INSERT INTO sbmFUNDESC VALUES ('Get_Info','titleFile'); INSERT INTO sbmFUNDESC VALUES ('Make_Modify_Record','modifyTemplate'); INSERT INTO sbmFUNDESC VALUES ('Send_APP_Mail','addressesAPP'); INSERT INTO sbmFUNDESC VALUES ('Send_APP_Mail','categformatAPP'); INSERT INTO sbmFUNDESC VALUES ('Send_APP_Mail','newrnin'); INSERT INTO sbmFUNDESC VALUES ('Send_APP_Mail','decision_file'); INSERT INTO sbmFUNDESC VALUES ('Send_APP_Mail','comments_file'); INSERT INTO sbmFUNDESC VALUES ('CaseEDS','casevariable'); INSERT INTO sbmFUNDESC VALUES ('CaseEDS','casevalues'); INSERT INTO sbmFUNDESC VALUES ('CaseEDS','casesteps'); INSERT INTO sbmFUNDESC VALUES ('CaseEDS','casedefault'); INSERT INTO sbmFUNDESC VALUES ('Send_SRV_Mail','noteFile'); INSERT INTO sbmFUNDESC VALUES ('Send_SRV_Mail','emailFile'); INSERT INTO sbmFUNDESC VALUES ('Mail_Submitter','emailFile'); 
INSERT INTO sbmFUNDESC VALUES ('Mail_Submitter','edsrn'); INSERT INTO sbmFUNDESC VALUES ('Mail_Submitter','newrnin'); INSERT INTO sbmFUNDESC VALUES ('Make_Record','sourceTemplate'); INSERT INTO sbmFUNDESC VALUES ('Make_Record','createTemplate'); INSERT INTO sbmFUNDESC VALUES ('Print_Success','edsrn'); INSERT INTO sbmFUNDESC VALUES ('Print_Success','newrnin'); INSERT INTO sbmFUNDESC VALUES ('Print_Success','status'); INSERT INTO sbmFUNDESC VALUES ('Make_Modify_Record','sourceTemplate'); INSERT INTO sbmFUNDESC VALUES ('Move_Files_to_Storage','documenttype'); INSERT INTO sbmFUNDESC VALUES ('Move_Files_to_Storage','iconsize'); INSERT INTO sbmFUNDESC VALUES ('Move_Files_to_Storage','paths_and_suffixes'); INSERT INTO sbmFUNDESC VALUES ('Move_Files_to_Storage','rename'); INSERT INTO sbmFUNDESC VALUES ('Move_Files_to_Storage','paths_and_restrictions'); INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','elementNameToDoctype'); INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','createIconDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','createRelatedFormats'); INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','iconsize'); INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','keepPreviousVersionDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Uploaded_Files','files_to_be_stamped'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Uploaded_Files','latex_template'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Uploaded_Files','latex_template_vars'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Uploaded_Files','stamp'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Uploaded_Files','layer'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Uploaded_Files','switch_file'); INSERT INTO sbmFUNDESC VALUES ('Make_Dummy_MARC_XML_Record','dummyrec_source_tpl'); INSERT INTO sbmFUNDESC VALUES ('Make_Dummy_MARC_XML_Record','dummyrec_create_tpl'); INSERT INTO sbmFUNDESC VALUES ('Print_Success_APP','decision_file'); INSERT INTO sbmFUNDESC VALUES ('Print_Success_APP','newrnin'); INSERT INTO sbmFUNDESC VALUES ('Send_Delete_Mail','edsrn'); INSERT INTO sbmFUNDESC VALUES ('Send_Delete_Mail','record_managers'); INSERT INTO sbmFUNDESC VALUES ('Second_Report_Number_Generation','2nd_rn_file'); INSERT INTO sbmFUNDESC VALUES ('Second_Report_Number_Generation','2nd_rn_format'); INSERT INTO sbmFUNDESC VALUES ('Second_Report_Number_Generation','2nd_rn_yeargen'); INSERT INTO sbmFUNDESC VALUES ('Second_Report_Number_Generation','2nd_rncateg_file'); INSERT INTO sbmFUNDESC VALUES ('Second_Report_Number_Generation','2nd_counterpath'); INSERT INTO sbmFUNDESC VALUES ('Second_Report_Number_Generation','2nd_nb_length'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','file_to_be_stamped'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','latex_template'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','latex_template_vars'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','new_file_name'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','stamp'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','layer'); INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','switch_file'); INSERT INTO sbmFUNDESC VALUES ('Move_FCKeditor_Files_to_Storage','input_fields'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','maxsize'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','minsize'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','doctypes'); 
INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','restrictions'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canDeleteDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canReviseDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canDescribeDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canCommentDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canKeepDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canAddFormatDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canRestrictDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canRenameDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canNameNewFiles'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','createRelatedFormats'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','keepDefault'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','showLinks'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','fileLabel'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','filenameLabel'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','descriptionLabel'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','commentLabel'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','restrictionLabel'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','startDoc'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','endDoc'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','defaultFilenameDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','maxFilesDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Move_Uploaded_Files_to_Storage','iconsize'); INSERT INTO sbmFUNDESC VALUES ('Move_Uploaded_Files_to_Storage','createIconDoctypes'); INSERT INTO sbmFUNDESC VALUES ('Move_Uploaded_Files_to_Storage','forceFileRevision'); INSERT INTO sbmFUNDESC VALUES ('Move_Photos_to_Storage','iconsize'); INSERT INTO sbmGFILERESULT VALUES ('HTML','HTML document'); INSERT INTO sbmGFILERESULT VALUES ('WORD','data'); INSERT INTO sbmGFILERESULT VALUES ('PDF','PDF document'); INSERT INTO sbmGFILERESULT VALUES ('PostScript','PostScript document'); INSERT INTO sbmGFILERESULT VALUES ('PostScript','data '); INSERT INTO sbmGFILERESULT VALUES ('PostScript','HP Printer Job Language data'); INSERT INTO sbmGFILERESULT VALUES ('jpg','JPEG image'); INSERT INTO sbmGFILERESULT VALUES ('Compressed PostScript','gzip compressed data'); INSERT INTO sbmGFILERESULT VALUES ('Tarred Tex (.tar)','tar archive'); INSERT INTO sbmGFILERESULT VALUES ('JPEG','JPEG image'); INSERT INTO sbmGFILERESULT VALUES ('GIF','GIF'); INSERT INTO collectiondetailedrecordpagetabs VALUES (8, 'usage;comments;metadata'); INSERT INTO collectiondetailedrecordpagetabs VALUES (19, 'usage;comments;metadata'); INSERT INTO collectiondetailedrecordpagetabs VALUES (18, 'usage;comments;metadata'); INSERT INTO collectiondetailedrecordpagetabs VALUES (17, 'usage;comments;metadata'); -- end of file diff --git a/modules/websearch/doc/search-guide.webdoc b/modules/websearch/doc/search-guide.webdoc index 55da7faa4..f55011b80 100644 --- a/modules/websearch/doc/search-guide.webdoc +++ b/modules/websearch/doc/search-guide.webdoc @@ -1,5289 +1,5378 @@ ## -*- mode: html; coding: utf-8; -*- ## ## This file is part of CDS Invenio. 
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

Our search engine tries to offer today's typical web searching experience, as gained with popular search engines such as Google. The nature of bibliographic searching differs from that of web page searching, though. We provide many extensions to enable a complex and precise structured search, including a combined metadata, fulltext and reference search in one go. This page lists several tips and tricks that you may find useful to this effect.

Notre moteur de recherche propose des fonctionnalités similaires à celles des moteurs de recherche actuellement disponibles sur le Web, tels que Google. La nature des recherches bibliographiques diffère cependant de la recherche de pages web. Nous proposons donc des extensions permettant des recherches structurées complexes et précises, dont une recherche combinée s'effectuant sur les métadonnées, le contenu des fichiers et les références de chaque notice. Cette page répertorie les trucs et astuces pouvant vous être utiles.

Unsere Suchmaschine bietet den heutigen Stand der Web-Such-Technologie, wie sie auch von bekannten Suchmaschinen wie zum Beispiel Google angeboten wird. Im Detail unterscheidet sich jedoch die bibliographische Suche von einer Web-Suche. Wir bieten mehrere Erweiterungen an, damit eine komplexe und genau strukturierte Suche möglich wird, inklusive einer kombinierten Metadaten-, Volltext- und Referenzsuche. Diese Seite stellt Tipps und Tricks vor, die für eine effektive Suche nützlich sind.

El motor de búsqueda de este sistema trata de ofrecer la tecnología más actual de búsqueda web, desarrollada por buscadores tan populares como Google. Sin embargo, la naturaleza de una búsqueda bibliográfica difiere considerablemente de la de una página web. La alternativa propuesta es la de proporcionar numerosas extensiones que hagan posible búsquedas de estructura compleja y precisa, incluso combinando metadatos, texto completo y citas bibliográficas en una misma consulta. Esta página ofrece una serie de consejos útiles para conseguir una búsqueda más eficaz.

El motor de cerca d'aquest sistema tracta d'oferir la tecnologia més actual de cerca web, desenvolupada per cercadors tan populars com Google. La naturalesa d'una cerca bibliogràfica, però, difereix considerablement de la d'una pàgina web. L'alternativa proposada és la de proporcionar nombroses extensions que permetin cerques d'estructura complexa i precisa, inclosa la combinació de metadades, text complet i referències bibliogràfiques a una mateixa cerca. Aquesta pàgina ofereix una sèrie de consells útils per aconseguir una cerca més eficaç.

    Simple versus advanced search
    Search guidance
    Searching for words versus phrases
    Boolean queries
    Parentheses
    Special characters and punctuation
    International characters
    Word truncation/stemming
    Structured metadata search
    Regular expressions
    Span queries
    Refersto/citedby search operators
    Combined metadata/fulltext/citation search
    Frequently asked questions
        How to wisely choose your search terms (speed-wise)
        How to search for publications by a given author
        How to sort according to a certain pattern
        How to get documents from other servers (Google, SPIRES, KEK)
        How to search in fulltext files
        How to search for citations

    Recherche simple versus recherche avancée
    Aide à la recherche
    Recherche de mots versus recherche de phrases
    Requêtes booléennes
    Parenthèses
    Caractères spéciaux et ponctuation
    Caractères internationaux
    Troncature des mots/indexation par radicaux
    Recherche structurée
    Expressions régulières
    Requêtes de plages
    Refersto/citedby search operators
    Recherche combinée métadonnées/fulltext/citation
    Foire aux questions
        Comment sélectionner vos termes de recherche de manière intelligente (en termes de vitesse)
        Comment rechercher les publications d'un auteur donné
        Comment trier d'après un certain critère
        Comment obtenir les documents d'autres serveurs (Google, SPIRES, KEK)
        Comment rechercher le contenu des fichiers
        Comment rechercher les citations

    Einfache versus erweiterte Suche
    Grundlagen
    Suche nach Wörtern und Wortgruppen
    Boolsche Suche
    Parentheses
    Spezielle Zeichen und Notation
    Internationale Zeichen
    Trunkierung
    Strukturierte Metadatensuche
    Regular expressions
    Bereichs-Recherche
    Refersto/citedby search operators
    Kombinierte Metadaten-/Volltext-/Zitatsuche
    FAQ
        Wie wähle ich am geschicktesten meinen Suchbegriff
        Wie suche ich nach Publikationen eines bestimmten Autors
        Wie lasse ich Ergebnisse auf eine bestimmte Weise sortieren
        Wie bekomme ich Dokumente anderer Server (Google, SPIRES, KEK)
        Wie kann ich in verknüpften Volltextdateien suchen
        Wie kann ich nach Zitaten suchen

    Búsqueda simple versus avanzada
    Búsqueda guiada
    Búsqueda por palabras versus búsqueda por frases
    Consultas booleanas
    Parentheses
    Caracteres especiales y puntuación
    Caracteres internacionales
    Truncamientos y búsquedas por raíz
    Búsqueda por metadatos estructurados
    Expresiones regulares
    Consultas por rango
    Refersto/citedby search operators
    Combinación de metadatos/texto completo/cita bibliográfica
    Preguntas frecuentes
        Elegir correctamente los términos de búsqueda (speed-wise)
        ¿Cómo localizar publicaciones a partir del autor?
        ¿Cómo ordenar acorde a cierto patrón de ordenación?
        ¿Cómo obtener documentos de otros servidores? (Google, SPIRES, KEK)
        ¿Cómo buscar en ficheros a texto completo?
        ¿Cómo buscar citas bibliográficas?

    Cerca simple versus avançada
    Cerca guiada
    Cerca per paraules versus cerca per frases
    Consultes booleanes
    Parentheses
    Caracters especials i puntuació
    Caracters internacionals
    Truncaments i cercques per arrel
    Cerca por metadadess estructurades
    Expressions regulars
    Consultes per rang +
    Refersto/citedby search operators
    Combinació de metadades/text complet/referència bibliogràfica
    Preguntes freqüents
        Escollir correctament els térmes de cerca (speed-wise)
        Com localitzar publicacions a partir de l'autor?
        Com ordenar d'acord a un patró d'ordenació?
        Com obtenir documents d'altres servidors? (Google, SPIRES, KEK)
        Cómo cercar a fitxers a text complet?
        Cómo cercar referències bibliogàfiques?

Simple versus advanced search

The default search mode is simple search, which basically provides you with one input box where you can type your query, followed by a possibility to choose one of the common indexes to search within. You would usually simply type the keywords you are interested in and hit return. For example, if you are interested in documents on the standard model that are written by (or mention) Ellis, you would type:
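    ellis standard model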

and on the search results page you could further add/remove keywords to home in more precisely on what you are looking for, as mentioned below.

The advanced search interface provides you with explicit tools to play with: you can change the matching type from the default word matching to phrase searching or regular expression matching; you can use boolean queries across several indexes, etc. For example, to find all the documents written by Ellis, J (spelled exactly that way) that contain either of the words muon or neutrino in the title and that were published in 2001, you would type:
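    author:"Ellis, J" and (title:muon or title:neutrino) and year:2001

(shown here in the equivalent simple search syntax; the advanced search form expresses the same query via its field and operator menus)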

Note that Simple Search can provide you with basically the same functionality, if you make use of the special syntax explained in the text below. The simple-versus-advanced distinction does not refer to the functionality provided but rather to the amount of parametrization you can "tweak". We conform to the common use of the simple/advanced terms as found in other search engines.

Much of what follows deals with how a power user would use the simple search interface. Recall that you can always go to the Advanced Search for more query assistance.

Search guidance

After you submit your query, the search engine will analyze it and will always try to guide you in case no exact match can be found. For example, it will print a list of the closest indexed terms in case of spelling trouble:

Alternative choices will be printed in red. The search engine will similarly warn you when your search terms cannot be found, or when they can but your boolean query cannot be met. The search engine will also silently try to search for alternative forms (e.g. with punctuation removed), etc.

Thanks to the multiple search stages and the guidance provided at each stage, it is usually sufficient to simply type what you are looking for and see what the system says in return. If you aren't satisfied, you can then add/remove words from your query until you get a satisfactory reply.

Searching for words versus phrases

The default search mode is a search for words. This means that any whitespace you type is not significant, but is rather interpreted to mean "add an automatic boolean AND between words", like Google does. For example, to find all records that contain both the word ellis and the word muon anywhere in the record, type:
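    ellis muon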

The whitespace would be significant if you include it within quotes. There are two phrase searching modes (indicative example queries for the cases below follow the list):
  1. The double quotes instruct the search engine to search for an exact phrase. This phrase search mode matches if and only if the given metadata field is exactly equal to the input pattern. For example, to find all documents written by Ellis, J spelled exactly that way, type the first example below.
  2. The single quotes instruct the search engine to search for a partial phrase. Unlike the exact phrase search, this mode allows for extra text appearing before/after the given pattern. This is somewhat similar to the "phrase search mode" common on Google and other fulltext engines that search for phrase expressions inside Web pages. For example, to find all the titles containing the expression muon decay regardless of the position of the expression in the title, type the second example below.
    This also shows how to search at the same time for an author spelled sometimes as Ellis, J and sometimes as Ellis, Jonathan Richard (and for other authors, such as De Lellis, Jim): see the third example below.
    (See also our specific author searching tips.)
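For instance (indicative queries for the three cases above, in order):

    author:"Ellis, J"
    title:'muon decay'
    author:'Ellis, J'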

The difference between the exact and partial phrase searching modes may not be obvious at first sight. While the latter is closer to what "phrase search" usually means in the context of web page search engines, the former is usually an order of magnitude faster if you know the precise values you are looking for.

Another interesting searching mode besides the word and phrase searches is the regular expression search, introduced by slashes instead of quotes. For example, the above partial phrase query 'muon decay' is fully equivalent to the regular expression query /muon decay/. The regular expression syntax is very powerful and permits you to construct very complex queries. For more information, please consult the regular expression section of this guide.

Boolean queries

We have already seen how whitespace adds a silent boolean AND in the search for words. The other boolean operators include:
+ (AND)
    ellis +muon      matches all records that contain both the word ellis and the word muon
    ellis muon       ditto, syntactic sugar
    ellis and muon   ditto, syntactic sugar
- (NOT)
    ellis -muon      matches all records that contain the word ellis but do not contain the word muon
    ellis not muon   ditto, syntactic sugar
| (OR)
    ellis |muon      matches all records that contain at least one of the two words
    ellis or muon    ditto, syntactic sugar

Logical operations are automatically chained from left to right. For example, if you want to search for documents written by Ellis on muons or kaons, write:
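    muon or kaon and ellis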

which looks for (muon or kaon) and ellis. Note that this gives different results from:
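    ellis and muon or kaon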
which would search for (ellis and muon) or kaon.

The left-to-right chaining behaviour permits you to easily refine your search by adding/removing words with and/not or +/- operators. For example, to exclude the documents on decay from the above search, append -decay:
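    muon or kaon and ellis -decay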

to get a refined list. Keep adding/removing terms until you are satisfied.

Parentheses

You can also use parentheses in your queries to group boolean expressions together:
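    (gravity or supergravity) and (ellis or perelstein)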

This query returns records containing either gravity or supergravity, and either ellis or perelstein anywhere in the record.

Note that you can use any number of parentheses in the query, but nesting of parentheses, such as foo AND (bar OR (fuux NOT quux)), is not supported.

Special characters and punctuation

When indexing words, attention is paid to indexing them both with and without punctuation, so that you can search for terms containing special characters, such as C++, verbatim:
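    C++

(an indicative query; it can be combined with an index, e.g. title:C++, as needed)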

For example, to find records containing the LaTeX expression $e^{+}e^{-}$ in the title, type:
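    title:$e^{+}e^{-}$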
For example, to find the document with the report number hep-ph/0204133, type:
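    reportnumber:hep-ph/0204133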
Note that the search is case-insensitive:
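    reportnumber:HEP-PH/0204133

(indicative: the uppercased variant finds the same record)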

International characters

The search engine works with Unicode UTF-8 so you can type your query strings in any language stored in the database. For example, to find the documents written by (or on) Пушкин, type:
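    Пушкин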

Note that you don't have to type accents to find accented results. For example, type Lemaitre to find papers by Lemaître:
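    Lemaitre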
IMPORTANT NOTE
Currently, words that include accented characters can only be retrieved by entering accented characters in the query.

Word truncation/stemming

Word truncation is supported via the asterisk (*) wildcard character. The wildcard instructs the search engine to match any number of characters in that place. For example, to find records that contain the words muon, muons, muonic etc, type:
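    muon*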

The wildcard query works both in prefix and infix position. For example, to get all the words that start by CERN-TH and end by 31, type:
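    CERN-TH*31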
Note that the wildcard will be ignored if you try to apply it to very short words, such as a*:
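    a*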
The wildcard character can be used also in the phrase searching mode. For example, to find all the documents whose title starts by "Neutrino mass", type:
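    title:"Neutrino mass*"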
Recall that we have introduced exact and partial phrase search modes. Actually, a partial phrase search launches an exact search enclosed within wildcards: we could say that 'foo bar baz' is equivalent to "*foo bar baz*". Now you can see why the partial phrase search is slow: due to the two asterisks in front of and after the text, each and every title in the database has to be looked up to determine whether it matches or not. (There are currently no partial phrase indexes.)

Structured metadata search

Searching within various bibliographic fields (such as title or author) is supported via a syntax similar to Google's "site:" feature. If a search term is preceded by a field name and a colon, then the term is searched for inside this field only. For example, to find documents containing the word ellis within the author index, type:
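    author:ellis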

To select documents written by Ellis that contain words like muon, muons, muonic within title, type:
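    author:ellis title:muon*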
To select documents written by the NA60 experiment from the year 2001, type:
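    experiment:NA60 year:2001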
The most common fields you may want to use are author, title, reportnumber, abstract, keyword, year, experiment, fulltext, and reference.

Regular expressions

The regular expression searching mode is mostly for power users acquainted with the traditional Unix/POSIX regexp syntax. In the Simple Search interface you can trigger it by using slashes instead of quotes:
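    title:/^E.*s$/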

while in the Advanced Search interface you can select the matching type explicitly by using the selection box menu. The above example will find all the titles that start with the letter E, followed by any number of arbitrary characters, and end with the letter s.

Another example could be a search for an author recorded in the database as either Ellis, J or Ellis, John:
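    author:/^Ellis, (J|John)$/

(an indicative pattern; the anchors and the exact alternation are assumptions)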

The regular expression search enables you to formulate very specific word proximity queries. For example, let us find all titles containing words dense and matter that are separated by at most one word that doesn't contain the letter l:
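    title:/dense ([^l ]* )?matter/

(an indicative pattern, assuming space-separated words; the optional group stands for at most one intervening word that contains no letter l)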

Note that you can also use character intervals such as [a-k] and occurrence counts such as {3}. For example, let us find all preprints that do not follow the year cataloguing policy, that is, YYYY to denote the year, optionally followed by ? or by another -YYYY:
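    reportnumber:/-[0-9]{4}(\?|-[0-9]{4})?$/

(an indicative sketch of the conforming pattern, assuming report numbers end with the year part; negating it with the - operator would single out the offending records)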

You can also use POSIX character classes such as [:digit:] or [:alnum:], so that the above query can be written equivalently as:
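    reportnumber:/-[[:digit:]]{4}(\?|-[[:digit:]]{4})?$/

(the same indicative sketch with the explicit [0-9] intervals replaced by a POSIX character class)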

To learn more about POSIX regular expressions, please consult the Wikipedia regexp article and the MySQL regexp documentation.

Span queries

The span query is provided via a -> sign. For example, to search for all documents on muon decay published between 1983 and 1992, type:
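    muon decay year:1983->1992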

To find all documents by authors with names ranging from Ellis, J to Ellis, Qqq, type:
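    author:"Ellis, J"->"Ellis, Qqq"

(indicative syntax; quoting the names, which contain spaces, is an assumption)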

Refersto and citedby search operators

It is possible to search the citation network by means of the citedby and refersto search operators. For example, to find out who cites hep-th/0201100, you can type:
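    refersto:reportnumber:hep-th/0201100

(indicative; this assumes the paper is identified via the reportnumber index)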

For example, to find out which papers are cited by Klebanov, you can type:
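    citedby:author:klebanov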

To set up a cite alert for new papers citing author I. Klebanov, you can type:
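    refersto:author:"Klebanov, I."

(an indicative query; the exact rendering of the author name is an assumption)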

Note that refersto and citedby search operators work on any regular query. For example, to find all papers that cite papers that are tagged with the gravitino keyword, type:
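    refersto:keyword:gravitino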

Note also that these operators can be freely combined with regular metadata search. For example, to find papers authored by Klebanov that are cited by Papadimitriou but that do not cite any of Papadimitriou's papers themselves, type:
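    author:klebanov and citedby:author:papadimitriou and not refersto:author:papadimitriou

(an indicative combination, built from the operators introduced above)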

Combined metadata/fulltext/citation search

All the syntax mentioned above can be combined together in one query. For example, to find documents that have the word ellis inside the author fields, that do not contain words like muon, 'muonic' etc in any field, that contain the phrase (or the substring, to be more precise) 'dense quark matter' inside the abstract fields, and that were published in a year starting with the digits '200', type:
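    author:ellis -muon* abstract:'dense quark matter' year:200*

(an indicative query combining the elements described above)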

Note that the default "any field" global index contains only the metadata terms, not the citation or fulltext terms. You have to explicitly mention the fulltext or reference index to search there. For example, to find the term Higgs in either metadata, references or fulltext files, type:
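    Higgs or reference:Higgs or fulltext:Higgs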
This permits an interesting combination of metadata, fulltext and citation search in the same query. For example, to get all documents written by Lin whose fulltext files contain the words Schwarzschild and AdS, and which cite the journal Adv. Theor. Math. Phys., type:
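    author:lin fulltext:Schwarzschild fulltext:AdS reference:"Adv. Theor. Math. Phys."

(indicative; quoting the journal title as an exact phrase is an assumption)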

Frequently asked questions

How to wisely choose your search terms (speed-wise)

  • Whenever possible, prefer word searches instead of phrase searches. Search rather for black hole than for "black hole".
  • Avoid common terms such as and, of, or CERN.
  • If you are searching for specific metadata information, such as a report number, choose the corresponding index.
  • If you are looking for a specific document collection, such as Theses, choose the Theses collection first, and start your search from there.

How to search for publications by a given author

You can search for an author in many ways, each having its own advantages and disadvantages.

  1. First of all, note that searching for words isn't usually what you would want here. If you choose to search for the words Ellis J within the author index, two queries (for the words Ellis and J) are performed first and a boolean AND of their results is taken next:
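    (Illustratively, such a word search amounts to:)

      author:ellis and author:j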

    Such a query would also match a document whose first author is Ellis, R and whose second author is Finch, A J, which is probably not what you wanted. While the search is very fast and would have found the author you were looking for, this technique can return many false positives, such as the one cited above. Instead of searching for words, a more suitable technique in this case is to search for phrases, which permits higher search precision.

  2. The author names are usually stored in a form containing initials only, such as Ellis, J. To get the list of publications of an author whose name is spelled exactly that way, type:
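    (Illustrative exact phrase search; double quotes denote an exact phrase:)

      author:"Ellis, J"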


          This way of searching gives you the highest precision and no false positives. (Assuming there are no other authors whose names are spelled Ellis, J, an assumption that is often false*.) The search is very fast.

        3. Sometimes an author's first name is abbreviated on some documents (such as Ellis, J) and spelled out in full on others (such as Ellis, John; possibly also with a middle name: Ellis, John Rolfe). To get the list of publications for all these forms at the same time, you could use a boolean OR query:
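          (Illustratively:)

            author:"Ellis, J" or author:"Ellis, John"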

          This way of searching still keeps the highest precision and no false positives. (Assuming there are no other authors whose names are spelled Ellis, J or Ellis, John, an assumption that is often false*.) The search is fast.

        4. To match all of the above forms in a single search term, you can try to use a wildcard query:
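          (Illustrative; * is the truncation wildcard:)

            author:"Ellis, J*"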

          It would match all author names that start with the text Ellis, J, i.e. not only the wanted forms Ellis, J and Ellis, John, but also Ellis, Jim, Ellis, John Rolfe, or Ellis, Jonathan Richard.

          This way of searching returns more results, which may be suitable when you don't know how the names are spelled in the database, but you also risk getting false positives. The search is relatively fast.

        5. Yet another, most general, alternative is to use partial phrase matching:
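          (Illustrative; single quotes denote a partial phrase, i.e. a substring match:)

            author:'Ellis, J'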

          It would find not only all the authors mentioned above, but also the ones whose names contain the expression Ellis, J anywhere inside the name, such as De Lellis, Jim. It thus gives you the largest possible number of hits at the largest risk of false positives. The search is relatively slow.

          (Note though that this way of searching may be very handy in the case of compound family names such as Pepe-Altarelli, M or 't Hooft, G, where a casual user query for Hooft, G would match the wanted author, unlike the methods mentioned above.)

        6. Finally, let us note that you can use the regular expression syntax to construct any complex author query. A simple example is to search for an author expressed in the database as either Ellis, J or Ellis, John:
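          (Illustrative; slashes denote a regular expression, and the anchored alternation is an assumption about the two desired forms:)

            author:/^Ellis, (J|John)$/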
          Please consult regular expression searching tips to know more about regular expression search possibilities.

        *NOTE: If you produce your own list of publications and you notice that sometimes your first name is spelled abbreviated and sometimes in full, or if you want to identify your publications among several authors with the same abbreviation, please contact the site administrators so that they can work with you on inputting a consistently spelled and properly formatted first name everywhere. Only consistent database content will ensure proper author search behaviour.


How to sort according to a certain pattern

You may select a certain field according to which to sort the search results, for example to sort the results by main title. However, sometimes you may want to sort by report number, and it may happen that your documents have several of them. For example, the report numbers hep-ph/0204140, CERN-TH-2002-069 and RM3-TH-02-4 all denote the same document. Now if you sort a search results set containing this document, the system will take into consideration the first report number, which may be any of these three. Sometimes you may want to classify this document under its hep-ph number, sometimes under its CERN number, depending on whether you are producing a list of CERN or of hep-ph publications. How can you tell the search engine to prefer one report number over the other?

In other words, the search engine by default answers a query like "sort by first author" or "sort by first report number", but sometimes you may want to ask it to "sort by the first report number that starts with the text CERN-". The latter possibility is available via a "silent" sort parameter called sp (for "sort pattern") that sorts preferentially according to the given textual pattern, when it can be found. The parameter is "silent" in the sense that it is not present in the search interface; you have to add it manually to your search URL. For example, to get all CERN-TH publications of the year 2001 sorted by their CERN-TH numbers, you would search for CERN-TH-2001* within the reportnumber index and, once satisfied with the results, add &sp=CERN-TH to the URL of the search results page to sort the results preferentially by CERN-TH report numbers, yielding a nicely sorted list of all CERN-TH 2001 publications.
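(An illustrative URL with a hypothetical host name; p, f and sp carry the pattern, the field and the sort pattern respectively:)

    http://your-site.example.org/search?p=CERN-TH-2001*&f=reportnumber&sp=CERN-TH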


How to get documents from other servers (Google, SPIRES, KEK)

On the search results page, links to other servers like Google, SPIRES or KEK are automatically proposed in a box entitled "Try your search on". You can simply click on the proposed links to run your query on these search engines.

Note that a link isn't printed if the target search engine doesn't support the query type. For example, SPIRES or KEK cannot search for terms within "any field", so we don't link to them in those cases.


How to search in fulltext files

If a metadata record contains some associated fulltext files, the system tries to extract the textual information from the files and index it into a separate fulltext index. To search for all records that contain the term e- in their fulltext files, type:
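(Illustratively:)

    fulltext:e-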

Recall that fulltext words aren't included in the default global ``any field'' index, but that you may freely combine a fulltext and metadata search. For example, to find all articles written by Ellis that contain the word muon either in the metadata or in the fulltext, type:
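(Illustratively, using parentheses to group the metadata and fulltext alternatives:)

    author:ellis and (muon or fulltext:muon)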


How to search for citations

If a metadata record contains an associated fulltext file, the system tries to extract references automatically from that file and index them into a separate reference index. To search for all records that cite Ellis in their reference lists, type:
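(Illustratively:)

    reference:ellis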

To search for all records that cite preprint hep-ph/0103062 in their reference lists, type:
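(Illustratively:)

    reference:hep-ph/0103062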
To search for all records that cite an article from Giddings and Ross published in Physical Review D in volume 61 in year 2000, type:
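(Illustrative; the exact journal reference format here is an assumption about how references are indexed:)

    reference:"Phys. Rev. D 61 (2000)"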
Recall that citation terms aren't included in the default global "any field" index, but that you may freely combine a citation search with a metadata search. For example, to find all articles on standard model that aren't written by Ellis but that do cite him, type:
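(Illustratively:)

    "standard model" -author:ellis reference:ellis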

diff --git a/modules/websearch/doc/search-tips.webdoc b/modules/websearch/doc/search-tips.webdoc index e9dbe31b6..88679a901 100644 --- a/modules/websearch/doc/search-tips.webdoc +++ b/modules/websearch/doc/search-tips.webdoc @@ -1,599 +1,667 @@ ## -*- mode: html; coding: utf-8; -*- ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
How to find any value in any field:
Empty search box returns all records in the database.
How to find documents in a particular collection:

_(Narrow by collection:)_
 Preprints
 Theses

Click on a link below the search box to see and select subcollections such as Preprints or Theses, or select/deselect the tick boxes next to a particular collection before doing the search.

_(Narrow by collection:)_
 Preprints
 Theses

If you want documents from a 'collection' not proposed by default, such as NA60 documents, then enter it as a search term.
more on how to wisely choose your search terms...
How to search for words/phrases (within titles, abstracts, etc):
- Returns records containing words higgs and + Return records containing words higgs and boson.
- Returns records containing phrase 'higgs boson' in + Return records containing phrase 'higgs boson' in title.
- Returns records entitled exactly "Higgs boson" but not + Return records entitled exactly "Higgs boson" but not records such as "Overview of Higgs boson production".
more on word and phrase searches...
How to use truncation:
- Returns records containing words muon, muons, + Return records containing words muon, muons, muonic, etc.
more on truncation...
How to use boolean operators:
_(or)_:
- Returns records containing both muon and kaon. + Return records containing both muon and kaon.
- Returns records containing either muon or kaon. + Return records containing either muon or kaon.
- Returns records containing muon but not kaon. + Return records containing muon but not kaon.
more on Boolean queries...
How to use parentheses:
- Returns records containing either gravity or supergravity, + Return records containing either gravity or supergravity, and ellis or perelstein. Nested parens are not supported.
more on parentheses...
How to find documents from a certain period:
Type 2003 and select the _(year)_ field.
Alternatively, type '_(year)_:' followed by the value.
You can enter a specific year range.
more on span queries...
+
+How to search inside citation network:
+ Return papers that are cited between 3 and 30 times.
+ Return all papers that cite any of the papers written by ellis. Useful for citation alerts.
+ Return all papers that are cited by any of the papers written by ellis.
+ more on refersto/citedby operators...
+
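+ (Illustrative query forms for the three cases above; the refersto and citedby operators are named in the patch itself, while the cited:M->N range syntax is an assumption:)
+
+   cited:3->30
+   refersto:author:ellis
+   citedby:author:ellis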
How to search for authors:
Some authors have unique names and their publications can be retrieved by searching for the surname in the _(any field)_ field.
For better results, type surname comma initial(s) and select the _(author)_ field.
Sometimes authors are indexed with their full name.
Find papers by J.Ellis written from 1990 until 1993, using truncation to match all first names beginning with J.
more on author searches...
More information:
Special characters, regular expressions, fulltext searching, citation searching, and other capabilities are fully explained in the complete Search Guide. Des explications additionnelles concernant les caractères spéciaux, les expressions régulières, la recherche dans le contenu des fichiers, des citations, ainsi que d'autres fonctionnalités sont disponibles dans le Guide de Recherche. Nella Guida di Ricerca completa sono spiegate in maniera estesa ulteriori funzionalità quali caratteri speciali, espressioni regolari, ricerca nel testo integrale, ricerca tramite citazioni. diff --git a/modules/websearch/lib/search_engine.py b/modules/websearch/lib/search_engine.py index a7a366b4a..64f9fdf24 100644 --- a/modules/websearch/lib/search_engine.py +++ b/modules/websearch/lib/search_engine.py @@ -1,5113 +1,5153 @@ # -*- coding: utf-8 -*- ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. # pylint: disable=C0301 """CDS Invenio Search Engine in mod_python.""" __lastupdated__ = """$Date$""" __revision__ = "$Id$" ## import general modules: import cgi import cStringIO import copy import string import os import re import time import urllib import urlparse import zlib import sys if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 ## import CDS Invenio stuff: from invenio.config import \ CFG_CERN_SITE, \ CFG_INSPIRE_SITE, \ CFG_OAI_ID_FIELD, \ CFG_WEBCOMMENT_ALLOW_REVIEWS, \ CFG_WEBSEARCH_CALL_BIBFORMAT, \ CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX, \ CFG_WEBSEARCH_FIELDS_CONVERT, \ CFG_WEBSEARCH_NB_RECORDS_TO_SORT, \ CFG_WEBSEARCH_SEARCH_CACHE_SIZE, \ CFG_WEBSEARCH_USE_JSMATH_FOR_FORMATS, \ CFG_WEBSEARCH_USE_ALEPH_SYSNOS, \ CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS, \ CFG_WEBSEARCH_FULLTEXT_SNIPPETS, \ CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE, \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, \ CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS, \ CFG_SITE_LANG, \ CFG_SITE_NAME, \ CFG_LOGDIR, \ CFG_BIBFORMAT_HIDDEN_TAGS, \ CFG_SITE_URL, \ CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS from invenio.search_engine_config import InvenioWebSearchUnknownCollectionError from invenio.bibrecord import create_record, record_get_field_instances from invenio.bibrank_record_sorter import get_bibrank_methods, rank_records, is_method_valid from invenio.bibrank_downloads_similarity import register_page_view_event, calculate_reading_similarity_list from invenio.bibindex_engine_stemmer import stem from invenio.bibindex_engine_tokenizer import wash_author_name, author_name_requires_phrase_search from invenio.bibformat import format_record, format_records, get_output_format_content_type, create_excel from invenio.bibformat_config import CFG_BIBFORMAT_USE_OLD_BIBFORMAT from invenio.bibrank_downloads_grapher import create_download_history_graph_and_box from 
invenio.data_cacher import DataCacher from invenio.websearch_external_collections import print_external_results_overview, perform_external_collection_search from invenio.access_control_admin import acc_get_action_id from invenio.access_control_config import VIEWRESTRCOLL, \ CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS from invenio.websearchadminlib import get_detailed_page_tabs from invenio.intbitset import intbitset as HitSet from invenio.dbquery import DatabaseError, deserialize_via_marshal from invenio.access_control_engine import acc_authorize_action from invenio.errorlib import register_exception from invenio.textutils import encode_for_xml import invenio.template webstyle_templates = invenio.template.load('webstyle') webcomment_templates = invenio.template.load('webcomment') from invenio.bibrank_citation_searcher import get_cited_by_count, calculate_cited_by_list, \ - calculate_co_cited_with_list, get_records_with_num_cites, get_self_cited_by + calculate_co_cited_with_list, get_records_with_num_cites, get_self_cited_by, \ + get_refersto_hitset, get_citedby_hitset from invenio.bibrank_citation_grapher import create_citation_history_graph_and_box from invenio.dbquery import run_sql, run_sql_cached, get_table_update_time from invenio.webuser import getUid, collect_user_info from invenio.webpage import pageheaderonly, pagefooteronly, create_error_box from invenio.messages import gettext_set_language from invenio.search_engine_query_parser import SearchQueryParenthesisedParser, \ InvenioWebSearchMismatchedParensError, SpiresToInvenioSyntaxConverter from invenio import webinterface_handler_config as apache try: import invenio.template websearch_templates = invenio.template.load('websearch') except: pass from invenio.websearch_external_collections import calculate_hosted_collections_results, do_calculate_hosted_collections_results from invenio.websearch_external_collections_config import CFG_HOSTED_COLLECTION_TIMEOUT_ANTE_SEARCH from invenio.websearch_external_collections_config import CFG_HOSTED_COLLECTION_TIMEOUT_POST_SEARCH from invenio.websearch_external_collections_config import CFG_EXTERNAL_COLLECTION_MAXRESULTS VIEWRESTRCOLL_ID = acc_get_action_id(VIEWRESTRCOLL) ## global vars: cfg_nb_browse_seen_records = 100 # limit of the number of records to check when browsing certain collection cfg_nicely_ordered_collection_list = 0 # do we propose collection list nicely ordered or alphabetical? 
## precompile some often-used regexp for speed reasons: re_word = re.compile('[\s]') re_quotes = re.compile('[\'\"]') re_doublequote = re.compile('\"') re_equal = re.compile('\=') re_logical_and = re.compile('\sand\s', re.I) re_logical_or = re.compile('\sor\s', re.I) re_logical_not = re.compile('\snot\s', re.I) re_operators = re.compile(r'\s([\+\-\|])\s') re_pattern_wildcards_after_spaces = re.compile(r'(\s)[\*\%]+') re_pattern_single_quotes = re.compile("'(.*?)'") re_pattern_double_quotes = re.compile("\"(.*?)\"") re_pattern_regexp_quotes = re.compile("\/(.*?)\/") re_pattern_short_words = re.compile(r'([\s\"]\w{1,3})[\*\%]+') re_pattern_space = re.compile("__SPACE__") re_pattern_today = re.compile("\$TODAY\$") re_pattern_parens = re.compile(r'\([^\)]+\s+[^\)]+\)') re_unicode_lowercase_a = re.compile(unicode(r"(?u)[áàäâãå]", "utf-8")) re_unicode_lowercase_ae = re.compile(unicode(r"(?u)[æ]", "utf-8")) re_unicode_lowercase_e = re.compile(unicode(r"(?u)[éèëê]", "utf-8")) re_unicode_lowercase_i = re.compile(unicode(r"(?u)[íìïî]", "utf-8")) re_unicode_lowercase_o = re.compile(unicode(r"(?u)[óòöôõø]", "utf-8")) re_unicode_lowercase_u = re.compile(unicode(r"(?u)[úùüû]", "utf-8")) re_unicode_lowercase_y = re.compile(unicode(r"(?u)[ýÿ]", "utf-8")) re_unicode_lowercase_c = re.compile(unicode(r"(?u)[çć]", "utf-8")) re_unicode_lowercase_n = re.compile(unicode(r"(?u)[ñ]", "utf-8")) re_unicode_uppercase_a = re.compile(unicode(r"(?u)[ÁÀÄÂÃÅ]", "utf-8")) re_unicode_uppercase_ae = re.compile(unicode(r"(?u)[Æ]", "utf-8")) re_unicode_uppercase_e = re.compile(unicode(r"(?u)[ÉÈËÊ]", "utf-8")) re_unicode_uppercase_i = re.compile(unicode(r"(?u)[ÍÌÏÎ]", "utf-8")) re_unicode_uppercase_o = re.compile(unicode(r"(?u)[ÓÒÖÔÕØ]", "utf-8")) re_unicode_uppercase_u = re.compile(unicode(r"(?u)[ÚÙÜÛ]", "utf-8")) re_unicode_uppercase_y = re.compile(unicode(r"(?u)[Ý]", "utf-8")) re_unicode_uppercase_c = re.compile(unicode(r"(?u)[ÇĆ]", "utf-8")) re_unicode_uppercase_n = re.compile(unicode(r"(?u)[Ñ]", "utf-8")) re_latex_lowercase_a = re.compile("\\\\[\"H'`~^vu=k]\{?a\}?") re_latex_lowercase_ae = re.compile("\\\\ae\\{\\}?") re_latex_lowercase_e = re.compile("\\\\[\"H'`~^vu=k]\\{?e\\}?") re_latex_lowercase_i = re.compile("\\\\[\"H'`~^vu=k]\\{?i\\}?") re_latex_lowercase_o = re.compile("\\\\[\"H'`~^vu=k]\\{?o\\}?") re_latex_lowercase_u = re.compile("\\\\[\"H'`~^vu=k]\\{?u\\}?") re_latex_lowercase_y = re.compile("\\\\[\"']\\{?y\\}?") re_latex_lowercase_c = re.compile("\\\\['uc]\\{?c\\}?") re_latex_lowercase_n = re.compile("\\\\[c'~^vu]\\{?n\\}?") re_latex_uppercase_a = re.compile("\\\\[\"H'`~^vu=k]\\{?A\\}?") re_latex_uppercase_ae = re.compile("\\\\AE\\{?\\}?") re_latex_uppercase_e = re.compile("\\\\[\"H'`~^vu=k]\\{?E\\}?") re_latex_uppercase_i = re.compile("\\\\[\"H'`~^vu=k]\\{?I\\}?") re_latex_uppercase_o = re.compile("\\\\[\"H'`~^vu=k]\\{?O\\}?") re_latex_uppercase_u = re.compile("\\\\[\"H'`~^vu=k]\\{?U\\}?") re_latex_uppercase_y = re.compile("\\\\[\"']\\{?Y\\}?") re_latex_uppercase_c = re.compile("\\\\['uc]\\{?C\\}?") re_latex_uppercase_n = re.compile("\\\\[c'~^vu]\\{?N\\}?") class RestrictedCollectionDataCacher(DataCacher): def __init__(self): def cache_filler(): ret = [] try: res = run_sql("""SELECT DISTINCT ar.value FROM accROLE_accACTION_accARGUMENT raa JOIN accARGUMENT ar ON raa.id_accARGUMENT = ar.id WHERE ar.keyword = 'collection' AND raa.id_accACTION = %s""", (VIEWRESTRCOLL_ID,)) except Exception: # database problems, return empty cache return [] for coll in res: ret.append(coll[0]) return ret def timestamp_verifier(): 
return max(get_table_update_time('accROLE_accACTION_accARGUMENT'), get_table_update_time('accARGUMENT')) DataCacher.__init__(self, cache_filler, timestamp_verifier) def collection_restricted_p(collection): restricted_collection_cache.recreate_cache_if_needed() return collection in restricted_collection_cache.cache try: restricted_collection_cache.is_ok_p except Exception: restricted_collection_cache = RestrictedCollectionDataCacher() def get_permitted_restricted_collections(user_info): """Return a list of collection that are restricted but for which the user is authorized.""" restricted_collection_cache.recreate_cache_if_needed() ret = [] for collection in restricted_collection_cache.cache: if acc_authorize_action(user_info, 'viewrestrcoll', collection=collection)[0] == 0: ret.append(collection) return ret def get_restricted_collections_for_recid(recid): """ Return the list of restricted collection names to which recid belongs. """ restricted_collections = run_sql("""SELECT c.name, c.reclist FROM accROLE_accACTION_accARGUMENT raa JOIN accARGUMENT ar ON raa.id_accARGUMENT = ar.id JOIN collection c ON ar.value=c.name WHERE ar.keyword = 'collection' AND raa.id_accACTION = %s""", (VIEWRESTRCOLL_ID,)) return [row[0] for row in restricted_collections if recid in HitSet(row[1])] def is_user_owner_of_record(user_info, recid): """ Check if the user is owner of the record, i.e. he is the submitter and/or belongs to a owner-like group authorized to 'see' the record. @param user_info: the user_info dictionary that describe the user. @type user_info: user_info dictionary @param recid: the record identifier. @type recid: positive integer @return: True if the user is 'owner' of the record; False otherwise @rtype: bool """ authorized_emails_or_group = [] for tag in CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS: authorized_emails_or_group.extend(get_fieldvalues(recid, tag)) for email_or_group in authorized_emails_or_group: if email_or_group in user_info['group']: return True email = email_or_group.strip().lower() if user_info['email'].strip().lower() == email: return True return False def check_user_can_view_record(user_info, recid): """ Check if the user is authorized to view the given recid. The function grants access in two cases: either user has author rights on this record, or he has view rights to the primary collection this record belongs to. @param user_info: the user_info dictionary that describe the user. @type user_info: user_info dictionary @param recid: the record identifier. @type recid: positive integer @return: (0, ''), when authorization is granted, (>0, 'message') when authorization is not granted @rtype: (int, string) """ restricted_collections = get_restricted_collections_for_recid(recid) if not restricted_collections or is_user_owner_of_record(user_info, recid): return (0, '') for collection in restricted_collections: (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=collection) if auth_code == 0: continue else: return (auth_code, auth_msg) return (0, '') class IndexStemmingDataCacher(DataCacher): """ Provides cache for stemming information for word/phrase indexes. This class is not to be used directly; use function get_index_stemming_language() instead. 
""" def __init__(self): def cache_filler(): try: res = run_sql("""SELECT id, stemming_language FROM idxINDEX""") except DatabaseError: # database problems, return empty cache return {} return dict(res) def timestamp_verifier(): return get_table_update_time('idxINDEX') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: index_stemming_cache.is_ok_p except Exception: index_stemming_cache = IndexStemmingDataCacher() def get_index_stemming_language(index_id): """Return stemming langugage for given index.""" index_stemming_cache.recreate_cache_if_needed() return index_stemming_cache.cache[index_id] class CollectionRecListDataCacher(DataCacher): """ Provides cache for collection reclist hitsets. This class is not to be used directly; use function get_collection_reclist() instead. """ def __init__(self): def cache_filler(): ret = {} try: res = run_sql("SELECT name,reclist FROM collection") except Exception: # database problems, return empty cache return {} for name, reclist in res: ret[name] = None # this will be filled later during runtime by calling get_collection_reclist(coll) return ret def timestamp_verifier(): return get_table_update_time('collection') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not collection_reclist_cache.is_ok_p: raise Exception except Exception: collection_reclist_cache = CollectionRecListDataCacher() def get_collection_reclist(coll): """Return hitset of recIDs that belong to the collection 'coll'.""" collection_reclist_cache.recreate_cache_if_needed() if not collection_reclist_cache.cache[coll]: # not yet it the cache, so calculate it and fill the cache: set = HitSet() query = "SELECT nbrecs,reclist FROM collection WHERE name=%s" res = run_sql(query, (coll, ), 1) if res: try: set = HitSet(res[0][1]) except: pass collection_reclist_cache.cache[coll] = set # finally, return reclist: return collection_reclist_cache.cache[coll] class SearchResultsCache(DataCacher): """ Provides temporary lazy cache for Search Results. Useful when users click on `next page'. """ def __init__(self): def cache_filler(): return {} def timestamp_verifier(): return '1970-01-01 00:00:00' # lazy cache is always okay; # its filling is governed by # CFG_WEBSEARCH_SEARCH_CACHE_SIZE DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not search_results_cache.is_ok_p: raise Exception except Exception: search_results_cache = SearchResultsCache() class CollectionI18nNameDataCacher(DataCacher): """ Provides cache for I18N collection names. This class is not to be used directly; use function get_coll_i18nname() instead. """ def __init__(self): def cache_filler(): ret = {} try: res = run_sql("SELECT c.name,cn.ln,cn.value FROM collectionname AS cn, collection AS c WHERE cn.id_collection=c.id AND cn.type='ln'") # ln=long name except Exception: # database problems return {} for c, ln, i18nname in res: if i18nname: if not ret.has_key(c): ret[c] = {} ret[c][ln] = i18nname return ret def timestamp_verifier(): return get_table_update_time('collectionname') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not collection_i18nname_cache.is_ok_p: raise Exception except Exception: collection_i18nname_cache = CollectionI18nNameDataCacher() def get_coll_i18nname(c, ln=CFG_SITE_LANG, verify_cache_timestamp=True): """ Return nicely formatted collection name (of the name type `ln' (=long name)) for collection C in language LN. This function uses collection_i18nname_cache, but it verifies whether the cache is up-to-date first by default. 
This verification step is performed by checking the DB table update time. So, if you call this function 1000 times, it can get very slow because it will do 1000 table update time verifications, even though collection names change not that often. Hence the parameter VERIFY_CACHE_TIMESTAMP which, when set to False, will assume the cache is already up-to-date. This is useful namely in the generation of collection lists for the search results page. """ if verify_cache_timestamp: collection_i18nname_cache.recreate_cache_if_needed() out = c try: out = collection_i18nname_cache.cache[c][ln] except KeyError: pass # translation in LN does not exist return out class FieldI18nNameDataCacher(DataCacher): """ Provides cache for I18N field names. This class is not to be used directly; use function get_field_i18nname() instead. """ def __init__(self): def cache_filler(): ret = {} try: res = run_sql("SELECT f.name,fn.ln,fn.value FROM fieldname AS fn, field AS f WHERE fn.id_field=f.id AND fn.type='ln'") # ln=long name except Exception: # database problems, return empty cache return {} for f, ln, i18nname in res: if i18nname: if not ret.has_key(f): ret[f] = {} ret[f][ln] = i18nname return ret def timestamp_verifier(): return get_table_update_time('fieldname') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not field_i18nname_cache.is_ok_p: raise Exception except Exception: field_i18nname_cache = FieldI18nNameDataCacher() def get_field_i18nname(f, ln=CFG_SITE_LANG, verify_cache_timestamp=True): """ Return nicely formatted field name (of type 'ln', 'long name') for field F in language LN. If VERIFY_CACHE_TIMESTAMP is set to True, then verify DB timestamp and field I18N name cache timestamp and refresh cache from the DB if needed. Otherwise don't bother checking DB timestamp and return the cached value. (This is useful when get_field_i18nname is called inside a loop.) """ if verify_cache_timestamp: field_i18nname_cache.recreate_cache_if_needed() out = f try: out = field_i18nname_cache.cache[f][ln] except KeyError: pass # translation in LN does not exist return out def get_alphabetically_ordered_collection_list(level=0, ln=CFG_SITE_LANG): """Returns nicely ordered (score respected) list of collections, more exactly list of tuples (collection name, printable collection name). Suitable for create_search_box().""" out = [] res = run_sql_cached("SELECT id,name FROM collection ORDER BY name ASC", affected_tables=['collection',]) for c_id, c_name in res: # make a nice printable name (e.g. truncate c_printable for # long collection names in given language): c_printable_fullname = get_coll_i18nname(c_name, ln, False) c_printable = wash_index_term(c_printable_fullname, 30, False) if c_printable != c_printable_fullname: c_printable = c_printable + "..." if level: c_printable = " " + level * '-' + " " + c_printable out.append([c_name, c_printable]) return out def get_nicely_ordered_collection_list(collid=1, level=0, ln=CFG_SITE_LANG): """Returns nicely ordered (score respected) list of collections, more exactly list of tuples (collection name, printable collection name). Suitable for create_search_box().""" colls_nicely_ordered = [] res = run_sql("""SELECT c.name,cc.id_son FROM collection_collection AS cc, collection AS c WHERE c.id=cc.id_son AND cc.id_dad=%s ORDER BY score DESC""", (collid, )) for c, cid in res: # make a nice printable name (e.g. 
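# E.g. when formatting many collection names in a loop, refresh the
# cache once and then pass verify_cache_timestamp=False (sketch;
# my_collections and ln are hypothetical caller variables):
#
#     collection_i18nname_cache.recreate_cache_if_needed()
#     names = [get_coll_i18nname(c, ln, False) for c in my_collections]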
truncate c_printable for # long collection names in given language): c_printable_fullname = get_coll_i18nname(c, ln, False) c_printable = wash_index_term(c_printable_fullname, 30, False) if c_printable != c_printable_fullname: c_printable = c_printable + "..." if level: c_printable = " " + level * '-' + " " + c_printable colls_nicely_ordered.append([c, c_printable]) colls_nicely_ordered = colls_nicely_ordered + get_nicely_ordered_collection_list(cid, level+1, ln=ln) return colls_nicely_ordered def get_index_id_from_field(field): """ Return index id with name corresponding to FIELD, or the first index id where the logical field code named FIELD is indexed. Return zero in case there is no index defined for this field. Example: field='author', output=4. """ out = 0 if field == '': field = 'global' # empty string field means 'global' index (field 'anyfield') # first look in the index table: res = run_sql("""SELECT id FROM idxINDEX WHERE name=%s""", (field,)) if res: out = res[0][0] return out # not found in the index table, now look in the field table: res = run_sql("""SELECT w.id FROM idxINDEX AS w, idxINDEX_field AS wf, field AS f WHERE f.code=%s AND wf.id_field=f.id AND w.id=wf.id_idxINDEX LIMIT 1""", (field,)) if res: out = res[0][0] return out def get_words_from_pattern(pattern): "Returns list of whitespace-separated words from pattern." words = {} for word in string.split(pattern): if not words.has_key(word): words[word] = 1 return words.keys() def create_basic_search_units(req, p, f, m=None, of='hb'): """Splits search pattern and search field into a list of independently searchable units. - A search unit consists of '(operator, pattern, field, type, hitset)' tuples where 'operator' is set union (|), set intersection (+) or set exclusion (-); 'pattern' is either a word (e.g. muon*) or a phrase (e.g. 'nuclear physics'); 'field' is either a code like 'title' or MARC tag like '100__a'; 'type' is the search type ('w' for word file search, 'a' for access file search). - Optionally, the function accepts the match type argument 'm'. If it is set (e.g. from advanced search interface), then it performs this kind of matching. If it is not set, then a guess is made. 'm' can have values: 'a'='all of the words', 'o'='any of the words', 'p'='phrase/substring', 'r'='regular expression', 'e'='exact value'. - Warnings are printed on req (when not None) in case of HTML output formats.""" opfts = [] # will hold (o,p,f,t,h) units # FIXME: quick hack for the journal index if f == 'journal': opfts.append(['+', p, f, 'w']) return opfts ## check arguments: is desired matching type set? if m: ## A - matching type is known; good! 
if m == 'e': # A1 - exact value: opfts.append(['+', p, f, 'a']) # '+' since we have only one unit elif m == 'p': # A2 - phrase/substring: opfts.append(['+', "%" + p + "%", f, 'a']) # '+' since we have only one unit elif m == 'r': # A3 - regular expression: opfts.append(['+', p, f, 'r']) # '+' since we have only one unit elif m == 'a' or m == 'w': # A4 - all of the words: p = strip_accents(p) # strip accents for 'w' mode, FIXME: delete when not needed for word in get_words_from_pattern(p): opfts.append(['+', word, f, 'w']) # '+' in all units elif m == 'o': # A5 - any of the words: p = strip_accents(p) # strip accents for 'w' mode, FIXME: delete when not needed for word in get_words_from_pattern(p): if len(opfts)==0: opfts.append(['+', word, f, 'w']) # '+' in the first unit else: opfts.append(['|', word, f, 'w']) # '|' in further units else: if of.startswith("h"): print_warning(req, "Matching type '%s' is not implemented yet." % cgi.escape(m), "Warning") opfts.append(['+', "%" + p + "%", f, 'w']) else: ## B - matching type is not known: let us try to determine it by some heuristics if f and p[0] == '"' and p[-1] == '"': ## B0 - does 'p' start and end by double quote, and is 'f' defined? => doing ACC search opfts.append(['+', p[1:-1], f, 'a']) elif (f == 'author' or f == 'exactauthor') and author_name_requires_phrase_search(p): ## B1 - do we search in author, and does 'p' contain space/comma/dot/etc? ## => doing washed ACC search opfts.append(['+', p, f, 'a']) elif f and p[0] == "'" and p[-1] == "'": ## B0bis - does 'p' start and end by single quote, and is 'f' defined? => doing ACC search opfts.append(['+', '%' + p[1:-1] + '%', f, 'a']) elif f and p[0] == "/" and p[-1] == "/": ## B0ter - does 'p' start and end by a slash, and is 'f' defined? => doing regexp search opfts.append(['+', p[1:-1], f, 'r']) elif f and string.find(p, ',') >= 0: ## B1 - does 'p' contain comma, and is 'f' defined? => doing ACC search opfts.append(['+', p, f, 'a']) elif f and str(f[0:2]).isdigit(): ## B2 - does 'f' exist and starts by two digits? => doing ACC search opfts.append(['+', p, f, 'a']) else: ## B3 - doing WRD search, but maybe ACC too # search units are separated by spaces unless the space is within single or double quotes # so, let us replace temporarily any space within quotes by '__SPACE__' p = re_pattern_single_quotes.sub(lambda x: "'"+string.replace(x.group(1), ' ', '__SPACE__')+"'", p) p = re_pattern_double_quotes.sub(lambda x: "\""+string.replace(x.group(1), ' ', '__SPACE__')+"\"", p) p = re_pattern_regexp_quotes.sub(lambda x: "/"+string.replace(x.group(1), ' ', '__SPACE__')+"/", p) # wash argument: p = re_equal.sub(":", p) p = re_logical_and.sub(" ", p) p = re_logical_or.sub(" |", p) p = re_logical_not.sub(" -", p) p = re_operators.sub(r' \1', p) for pi in string.split(p): # iterate through separated units (or items, as "pi" stands for "p item") pi = re_pattern_space.sub(" ", pi) # replace back '__SPACE__' by ' ' # firstly, determine set operator if pi[0] == '+' or pi[0] == '-' or pi[0] == '|': oi = pi[0] pi = pi[1:] else: # okay, there is no operator, so let us decide what to do by default oi = '+' # by default we are doing set intersection... 
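# Illustrative unit shapes for the branches above ('ellis' is the
# documentation's stock example):
#
#     create_basic_search_units(None, 'ellis muon', 'title', 'o')
#     # -> [['+', 'ellis', 'title', 'w'], ['|', 'muon', 'title', 'w']]
#     create_basic_search_units(None, '"nuclear physics"', 'title')
#     # -> [['+', 'nuclear physics', 'title', 'a']]   (B0, quoted phrase)
#     create_basic_search_units(None, '/^E[0-9]+/', 'title')
#     # -> [['+', '^E[0-9]+', 'title', 'r']]          (B0ter, regexp)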
# secondly, determine search pattern and field: if string.find(pi, ":") > 0: fi, pi = string.split(pi, ":", 1) # test whether fi is a real index code or a MARC-tag defined code: if fi in get_fieldcodes() or '00' <= fi[:2] <= '99': pass else: # it is not, so join it back: fi, pi = f, fi + ":" + pi else: fi, pi = f, pi # look also for old ALEPH field names: if fi and CFG_WEBSEARCH_FIELDS_CONVERT.has_key(string.lower(fi)): fi = CFG_WEBSEARCH_FIELDS_CONVERT[string.lower(fi)] # wash 'pi' argument: if re_quotes.match(pi): # B3a - quotes are found => do ACC search (phrase search) if pi[0] == '"' and pi[-1] == '"': pi = string.replace(pi, '"', '') # remove quote signs opfts.append([oi, pi, fi, 'a']) elif pi[0] == "'" and pi[-1] == "'": pi = string.replace(pi, "'", "") # remove quote signs opfts.append([oi, "%" + pi + "%", fi, 'a']) else: # unbalanced quotes, so fall back to WRD query: opfts.append([oi, pi, fi, 'w']) elif fi and str(fi[0]).isdigit() and str(fi[0]).isdigit(): # B3b - fi exists and starts by two digits => do ACC search opfts.append([oi, pi, fi, 'a']) elif fi and not get_index_id_from_field(fi) and get_field_name(fi): # B3c - logical field fi exists but there is no WRD index for fi => try ACC search opfts.append([oi, pi, fi, 'a']) elif pi.startswith('/') and pi.endswith('/'): # B3d - pi has slashes around => do regexp search opfts.append([oi, pi[1:-1], fi, 'r']) else: # B3e - general case => do WRD search pi = strip_accents(pi) # strip accents for 'w' mode, FIXME: delete when not needed for pii in get_words_from_pattern(pi): opfts.append([oi, pii, fi, 'w']) ## sanity check: for i in range(0, len(opfts)): try: pi = opfts[i][1] if pi == '*': if of.startswith("h"): print_warning(req, "Ignoring standalone wildcard word.", "Warning") del opfts[i] if pi == '' or pi == ' ': fi = opfts[i][2] if fi: if of.startswith("h"): print_warning(req, "Ignoring empty %s search term." % fi, "Warning") del opfts[i] except: pass ## return search units: return opfts def page_start(req, of, cc, aas, ln, uid, title_message=None, description='', keywords='', recID=-1, tab='', p=''): "Start page according to given output format." _ = gettext_set_language(ln) if not req or isinstance(req, cStringIO.OutputType): return # we were called from CLI if not title_message: title_message = _("Search Results") content_type = get_output_format_content_type(of) if of.startswith('x'): if of == 'xr': # we are doing RSS output req.content_type = "application/rss+xml" req.send_http_header() req.write("""\n""") else: # we are doing XML output: req.content_type = "text/xml" req.send_http_header() req.write("""\n""") elif of.startswith('t') or str(of[0:3]).isdigit(): # we are doing plain text output: req.content_type = "text/plain" req.send_http_header() elif of == "id": pass # nothing to do, we shall only return list of recIDs elif content_type == 'text/html': # we are doing HTML output: req.content_type = "text/html" req.send_http_header() if not description: description = "%s %s." 
% (cc, _("Search Results")) if not keywords: keywords = "%s, WebSearch, %s" % (get_coll_i18nname(CFG_SITE_NAME, ln, False), get_coll_i18nname(cc, ln, False)) ## generate RSS URL: argd = {} if req.args: argd = cgi.parse_qs(req.args) rssurl = websearch_templates.build_rss_url(argd) ## add jsmath if displaying single records (FIXME: find ## eventual better place to this code) if of.lower() in CFG_WEBSEARCH_USE_JSMATH_FOR_FORMATS: metaheaderadd = """ """ else: metaheaderadd = '' ## generate navtrail: navtrail = create_navtrail_links(cc, aas, ln) if navtrail != '': navtrail += ' > ' if (tab != '' or ((of != '' or of.lower() != 'hd') and of != 'hb')) and \ recID != -1: # If we are not in information tab in HD format, customize # the nav. trail to have a link back to main record. (Due # to the way perform_request_search() works, hb # (lowercase) is equal to hd) navtrail += ' %s' % \ (CFG_SITE_URL, recID, title_message) if (of != '' or of.lower() != 'hd') and of != 'hb': # Export format_name = of query = "SELECT name FROM format WHERE code=%s" res = run_sql(query, (of,)) if res: format_name = res[0][0] navtrail += ' > ' + format_name else: # Discussion, citations, etc. tabs tab_label = get_detailed_page_tabs(cc, ln=ln)[tab]['label'] navtrail += ' > ' + _(tab_label) else: navtrail += title_message if p: # we are serving search/browse results pages, so insert pattern: navtrail += ": " + cgi.escape(p) title_message = cgi.escape(p) + " - " + title_message ## finally, print page header: req.write(pageheaderonly(req=req, title=title_message, navtrail=navtrail, description=description, keywords=keywords, metaheaderadd=metaheaderadd, uid=uid, language=ln, navmenuid='search', navtrail_append_title_p=0, rssurl=rssurl)) req.write(websearch_templates.tmpl_search_pagestart(ln=ln)) #else: # req.send_http_header() def page_end(req, of="hb", ln=CFG_SITE_LANG): "End page according to given output format: e.g. close XML tags, add HTML footer, etc." if of == "id": return [] # empty recID list if not req: return # we were called from CLI if of.startswith('h'): req.write(websearch_templates.tmpl_search_pageend(ln = ln)) # pagebody end req.write(pagefooteronly(lastupdated=__lastupdated__, language=ln, req=req)) return def create_page_title_search_pattern_info(p, p1, p2, p3): """Create the search pattern bit for the page web page HTML header. Basically combine p and (p1,p2,p3) together so that the page header may be filled whether we are in the Simple Search or Advanced Search interface contexts.""" out = "" if p: out = p else: out = p1 if p2: out += ' ' + p2 if p3: out += ' ' + p3 return out def create_inputdate_box(name="d1", selected_year=0, selected_month=0, selected_day=0, ln=CFG_SITE_LANG): "Produces 'From Date', 'Until Date' kind of selection box. Suitable for search options." 
_ = gettext_set_language(ln) box = "" # day box += """<select name="%sd">""" % name box += """<option value="">%s""" % _("any day") for day in range(1, 32): box += """<option value="%02d"%s>%02d""" % (day, is_selected(day, selected_day), day) box += """</select>""" # month box += """<select name="%sm">""" % name box += """<option value="">%s""" % _("any month") for mm, month in [(1, _("January")), (2, _("February")), (3, _("March")), (4, _("April")), \ (5, _("May")), (6, _("June")), (7, _("July")), (8, _("August")), \ (9, _("September")), (10, _("October")), (11, _("November")), (12, _("December"))]: box += """<option value="%02d"%s>%s""" % (mm, is_selected(mm, selected_month), month) box += """</select>""" # year box += """<select name="%sy">""" % name box += """<option value="">%s""" % _("any year") this_year = int(time.strftime("%Y", time.localtime())) for year in range(this_year-20, this_year+1): box += """<option value="%d"%s>%d""" % (year, is_selected(year, selected_year), year) box += """</select>""" return box def create_search_box(cc, colls, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action=""): """Create search box for 'search again in the results page' functionality.""" # load the right message language _ = gettext_set_language(ln) # some computations cc_intl = get_coll_i18nname(cc, ln, False) cc_colID = get_colID(cc) colls_nicely_ordered = [] if cfg_nicely_ordered_collection_list: colls_nicely_ordered = get_nicely_ordered_collection_list(ln=ln) else: colls_nicely_ordered = get_alphabetically_ordered_collection_list(ln=ln) colls_nice = [] for (cx, cx_printable) in colls_nicely_ordered: if not cx.startswith("Unnamed collection"): colls_nice.append({ 'value' : cx, 'text' : cx_printable }) coll_selects = [] if colls and colls[0] != CFG_SITE_NAME: # some collections are defined, so print these first, and only then print 'add another collection' heading: for c in colls: if c: temp = [] temp.append({ 'value' : CFG_SITE_NAME, 'text' : '*** %s ***' % _("any public collection") }) # this field is used to remove the current collection from the ones to be searched. temp.append({ 'value' : '', 'text' : '*** %s ***' % _("remove this collection") }) for val in colls_nice: # print collection: if not cx.startswith("Unnamed collection"): temp.append({ 'value' : val['value'], 'text' : val['text'], 'selected' : (c == re.sub("^[\s\-]*","", val['value'])) }) coll_selects.append(temp) coll_selects.append([{ 'value' : '', 'text' : '*** %s ***' % _("add another collection") }] + colls_nice) else: # we searched in CFG_SITE_NAME, so print 'any public collection' heading coll_selects.append([{ 'value' : CFG_SITE_NAME, 'text' : '*** %s ***' % _("any public collection") }] + colls_nice) ## ranking methods ranks = [{ 'value' : '', 'text' : "- %s %s -" % (_("OR").lower (), _("rank by")), }] for (code, name) in get_bibrank_methods(cc_colID, ln): # propose found rank methods: ranks.append({ 'value' : code, 'text' : name, }) formats = [] query = """SELECT code,name FROM format WHERE visibility='1' ORDER BY name ASC""" res = run_sql(query) if res: # propose found formats: for code, name in res: formats.append({ 'value' : code, 'text' : name }) else: formats.append({'value' : 'hb', 'text' : _("HTML brief") }) # show collections in the search box? 
(not if there is only one # collection defined, and not if we are in light search) show_colls = True show_title = True if len(collection_reclist_cache.cache.keys()) == 1 or \ aas == -1: show_colls = False show_title = False if cc == CFG_SITE_NAME: show_title = False return websearch_templates.tmpl_search_box( ln = ln, aas = aas, cc_intl = cc_intl, cc = cc, ot = ot, sp = sp, action = action, fieldslist = get_searchwithin_fields(ln=ln, colID=cc_colID), f1 = f1, f2 = f2, f3 = f3, m1 = m1, m2 = m2, m3 = m3, p1 = p1, p2 = p2, p3 = p3, op1 = op1, op2 = op2, rm = rm, p = p, f = f, coll_selects = coll_selects, d1y = d1y, d2y = d2y, d1m = d1m, d2m = d2m, d1d = d1d, d2d = d2d, dt = dt, sort_fields = get_sortby_fields(ln=ln, colID=cc_colID), sf = sf, so = so, ranks = ranks, sc = sc, rg = rg, formats = formats, of = of, pl = pl, jrec = jrec, ec = ec, show_colls = show_colls, show_title = show_title, ) def create_navtrail_links(cc=CFG_SITE_NAME, aas=0, ln=CFG_SITE_LANG, self_p=1, tab=''): """Creates navigation trail links, i.e. links to collection ancestors (except Home collection). If aas==1, then links to Advanced Search interfaces; otherwise Simple Search. """ dads = [] for dad in get_coll_ancestors(cc): if dad != CFG_SITE_NAME: # exclude Home collection dads.append ((dad, get_coll_i18nname(dad, ln, False))) if self_p and cc != CFG_SITE_NAME: dads.append((cc, get_coll_i18nname(cc, ln, False))) return websearch_templates.tmpl_navtrail_links( aas=aas, ln=ln, dads=dads) def get_searchwithin_fields(ln='en', colID=None): """Retrieves the fields name used in the 'search within' selection box for the collection ID colID.""" res = None if colID: res = run_sql_cached("""SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='sew' AND cff.id_collection=%s AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""", (colID,), affected_tables=['field', 'collection_field_fieldvalue']) if not res: res = run_sql_cached("SELECT code,name FROM field ORDER BY name ASC", affected_tables=['field',]) fields = [{ 'value' : '', 'text' : get_field_i18nname("any field", ln, False) }] for field_code, field_name in res: if field_code and field_code != "anyfield": fields.append({ 'value' : field_code, 'text' : get_field_i18nname(field_name, ln, False) }) return fields def get_sortby_fields(ln='en', colID=None): """Retrieves the fields name used in the 'sort by' selection box for the collection ID colID.""" _ = gettext_set_language(ln) res = None if colID: res = run_sql_cached("""SELECT DISTINCT(f.code),f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='soo' AND cff.id_collection=%s AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""", (colID,), affected_tables=['field', 'collection_field_fieldvalue']) if not res: # no sort fields defined for this colID, try to take Home collection: res = run_sql_cached("""SELECT DISTINCT(f.code),f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='soo' AND cff.id_collection=%s AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""", (1,), affected_tables=['field', 'collection_field_fieldvalue']) if not res: # no sort fields defined for the Home collection, take all sort fields defined wherever they are: res = run_sql_cached("""SELECT DISTINCT(f.code),f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='soo' AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""", affected_tables=['field', 'collection_field_fieldvalue']) fields = [{ 'value' : '', 'text' : _("latest first") }] 
for field_code, field_name in res: if field_code and field_code != "anyfield": fields.append({ 'value' : field_code, 'text' : get_field_i18nname(field_name, ln, False) }) return fields def create_andornot_box(name='op', value='', ln='en'): "Returns HTML code for the AND/OR/NOT selection box." _ = gettext_set_language(ln) out = """ <select name="%s"> <option value="a"%s>%s <option value="o"%s>%s <option value="n"%s>%s </select> """ % (name, is_selected('a', value), _("AND"), is_selected('o', value), _("OR"), is_selected('n', value), _("AND NOT")) return out def create_matchtype_box(name='m', value='', ln='en'): "Returns HTML code for the 'match type' selection box." _ = gettext_set_language(ln) out = """ <select name="%s"> <option value="a"%s>%s <option value="o"%s>%s <option value="e"%s>%s <option value="p"%s>%s <option value="r"%s>%s </select> """ % (name, is_selected('a', value), _("All of the words:"), is_selected('o', value), _("Any of the words:"), is_selected('e', value), _("Exact phrase:"), is_selected('p', value), _("Partial phrase:"), is_selected('r', value), _("Regular expression:")) return out def is_selected(var, fld): "Checks if the two are equal, and if yes, returns ' selected'. Useful for select boxes." if type(var) is int and type(fld) is int: if var == fld: return " selected" elif str(var) == str(fld): return " selected" elif fld and len(fld)==3 and fld[0] == "w" and var == fld[1:]: return " selected" return "" def wash_colls(cc, c, split_colls=0, verbose=0): """Wash collection list by checking whether user has deselected anything under 'Narrow search'. Checks also if cc is a list or not. Return list of cc, colls_to_display, colls_to_search since the list of collections to display is different from that to search in. This is because users might have chosen 'split by collection' functionality. The behaviour of "collections to display" depends solely whether user has deselected a particular collection: e.g. if it started from 'Articles and Preprints' page, and deselected 'Preprints', then collection to display is 'Articles'. If he did not deselect anything, then collection to display is 'Articles & Preprints'. The behaviour of "collections to search in" depends on the 'split_colls' parameter: * if is equal to 1, then we can wash the colls list down and search solely in the collection the user started from; * if is equal to 0, then we are splitting to the first level of collections, i.e. collections as they appear on the page we started to search from; The function raises exception InvenioWebSearchUnknownCollectionError if cc or one of c collections is not known. 
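Return-shape sketch (the collection names are hypothetical):

    (cc, to_display, to_search, hosted, dbg) = wash_colls(
        'Articles & Preprints', ['Articles'], split_colls=0)

Here to_display would be ['Articles'] if 'Preprints' had been
deselected; to_search follows to_display, minus hosted collections,
which are returned separately in hosted.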
""" colls_out = [] colls_out_for_display = [] # list to hold the hosted collections to be searched and displayed hosted_colls_out = [] debug = "" if verbose: debug += "<br />" debug += "<br />1) --- initial parameters ---" debug += "<br />cc : %s" % cc debug += "<br />c : %s" % c debug += "<br />" # check what type is 'cc': if type(cc) is list: for ci in cc: if collection_reclist_cache.cache.has_key(ci): # yes this collection is real, so use it: cc = ci break else: # check once if cc is real: if not collection_reclist_cache.cache.has_key(cc): if cc: raise InvenioWebSearchUnknownCollectionError(cc) else: cc = CFG_SITE_NAME # cc is not set, so replace it with Home collection # check type of 'c' argument: if type(c) is list: colls = c else: colls = [c] if verbose: debug += "<br />2) --- after check for the integrity of cc and the being or not c a list ---" debug += "<br />cc : %s" % cc debug += "<br />c : %s" % c debug += "<br />" # remove all 'unreal' collections: colls_real = [] for coll in colls: if collection_reclist_cache.cache.has_key(coll): colls_real.append(coll) else: if coll: raise InvenioWebSearchUnknownCollectionError(coll) colls = colls_real if verbose: debug += "<br />3) --- keeping only the real colls of c ---" debug += "<br />colls : %s" % colls debug += "<br />" # check if some real collections remain: if len(colls)==0: colls = [cc] if verbose: debug += "<br />4) --- in case no colls were left we use cc directly ---" debug += "<br />colls : %s" % colls debug += "<br />" # then let us check the list of non-restricted "real" sons of 'cc' and compare it to 'coll': res = run_sql("""SELECT c.name FROM collection AS c, collection_collection AS cc, collection AS ccc WHERE c.id=cc.id_son AND cc.id_dad=ccc.id AND ccc.name=%s AND cc.type='r'""", (cc,)) # list that holds all the non restricted sons of cc that are also not hosted collections l_cc_nonrestricted_sons_and_nonhosted_colls = [] res_hosted = run_sql("""SELECT c.name FROM collection AS c, collection_collection AS cc, collection AS ccc WHERE c.id=cc.id_son AND cc.id_dad=ccc.id AND ccc.name=%s AND cc.type='r' AND (c.dbquery NOT LIKE 'hostedcollection:%%' OR c.dbquery IS NULL)""", (cc,)) for row_hosted in res_hosted: l_cc_nonrestricted_sons_and_nonhosted_colls.append(row_hosted[0]) l_cc_nonrestricted_sons_and_nonhosted_colls.sort() l_cc_nonrestricted_sons = [] l_c = colls for row in res: if not collection_restricted_p(row[0]): l_cc_nonrestricted_sons.append(row[0]) l_c.sort() l_cc_nonrestricted_sons.sort() if l_cc_nonrestricted_sons == l_c: colls_out_for_display = [cc] # yep, washing permitted, it is sufficient to display 'cc' # the following elif is a hack that preserves the above funcionality when we start searching from # the frontpage with some hosted collections deselected (either by default or manually) elif set(l_cc_nonrestricted_sons_and_nonhosted_colls).issubset(set(l_c)): colls_out_for_display = colls split_colls = 0 else: colls_out_for_display = colls # nope, we need to display all 'colls' successively # remove duplicates: #colls_out_for_display_nondups=filter(lambda x, colls_out_for_display=colls_out_for_display: colls_out_for_display[x-1] not in colls_out_for_display[x:], range(1, len(colls_out_for_display)+1)) #colls_out_for_display = map(lambda x, colls_out_for_display=colls_out_for_display:colls_out_for_display[x-1], colls_out_for_display_nondups) colls_out_for_display = list(set(colls_out_for_display)) if verbose: debug += "<br />5) --- decide whether colls_out_for_diplay should be colls or is it sufficient for it 
to be cc; remove duplicates ---" debug += "<br />colls_out_for_display : %s" % colls_out_for_display debug += "<br />" # the following piece of code takes care of removing collections whose ancestors are going to be searched anyway # list to hold the collections to be removed colls_to_be_removed = [] # first calculate the collections that can safely be removed for coll in colls_out_for_display: for ancestor in get_coll_ancestors(coll): #if ancestor in colls_out_for_display: colls_to_be_removed.append(coll) if ancestor in colls_out_for_display and not is_hosted_collection(coll): colls_to_be_removed.append(coll) # secondly remove the collections for coll in colls_to_be_removed: colls_out_for_display.remove(coll) if verbose: debug += "<br />6) --- remove collections that have ancestors about to be search, unless they are hosted ---" debug += "<br />colls_out_for_display : %s" % colls_out_for_display debug += "<br />" # calculate the hosted collections to be searched. if colls_out_for_display == [cc]: if is_hosted_collection(cc): hosted_colls_out.append(cc) else: for coll in get_coll_sons(cc): if is_hosted_collection(coll): hosted_colls_out.append(coll) else: for coll in colls_out_for_display: if is_hosted_collection(coll): hosted_colls_out.append(coll) if verbose: debug += "<br />7) --- calculate the hosted_colls_out ---" debug += "<br />hosted_colls_out : %s" % hosted_colls_out debug += "<br />" # second, let us decide on collection splitting: if split_colls == 0: # type A - no sons are wanted colls_out = colls_out_for_display else: # type B - sons (first-level descendants) are wanted for coll in colls_out_for_display: coll_sons = get_coll_sons(coll) if coll_sons == []: colls_out.append(coll) else: for coll_son in coll_sons: if not is_hosted_collection(coll_son): colls_out.append(coll_son) #else: # colls_out = colls_out + coll_sons # remove duplicates: #colls_out_nondups=filter(lambda x, colls_out=colls_out: colls_out[x-1] not in colls_out[x:], range(1, len(colls_out)+1)) #colls_out = map(lambda x, colls_out=colls_out:colls_out[x-1], colls_out_nondups) colls_out = list(set(colls_out)) if verbose: debug += "<br />8) --- calculate the colls_out; remove duplicates ---" debug += "<br />colls_out : %s" % colls_out debug += "<br />" # remove the hosted collections from the collections to be searched if hosted_colls_out: for coll in hosted_colls_out: try: colls_out.remove(coll) except ValueError: # in case coll was not found in colls_out pass if verbose: debug += "<br />9) --- remove the hosted_colls from the colls_out ---" debug += "<br />colls_out : %s" % colls_out return (cc, colls_out_for_display, colls_out, hosted_colls_out, debug) def strip_accents(x): """Strip accents in the input phrase X (assumed in UTF-8) by replacing accented characters with their unaccented cousins (e.g. é by e). 
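For example (expected behaviour, assuming the usual accent regexps
defined at the top of this module):

    strip_accents('Müller')   # -> 'Muller'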
Return such a stripped X.""" x = re_latex_lowercase_a.sub("a", x) x = re_latex_lowercase_ae.sub("ae", x) x = re_latex_lowercase_e.sub("e", x) x = re_latex_lowercase_i.sub("i", x) x = re_latex_lowercase_o.sub("o", x) x = re_latex_lowercase_u.sub("u", x) x = re_latex_lowercase_y.sub("y", x) x = re_latex_lowercase_c.sub("c", x) x = re_latex_lowercase_n.sub("n", x) x = re_latex_uppercase_a.sub("A", x) x = re_latex_uppercase_ae.sub("AE", x) x = re_latex_uppercase_e.sub("E", x) x = re_latex_uppercase_i.sub("I", x) x = re_latex_uppercase_o.sub("O", x) x = re_latex_uppercase_u.sub("U", x) x = re_latex_uppercase_y.sub("Y", x) x = re_latex_uppercase_c.sub("C", x) x = re_latex_uppercase_n.sub("N", x) # convert input into Unicode string: try: y = unicode(x, "utf-8") except: return x # something went wrong, probably the input wasn't UTF-8 # asciify Latin-1 lowercase characters: y = re_unicode_lowercase_a.sub("a", y) y = re_unicode_lowercase_ae.sub("ae", y) y = re_unicode_lowercase_e.sub("e", y) y = re_unicode_lowercase_i.sub("i", y) y = re_unicode_lowercase_o.sub("o", y) y = re_unicode_lowercase_u.sub("u", y) y = re_unicode_lowercase_y.sub("y", y) y = re_unicode_lowercase_c.sub("c", y) y = re_unicode_lowercase_n.sub("n", y) # asciify Latin-1 uppercase characters: y = re_unicode_uppercase_a.sub("A", y) y = re_unicode_uppercase_ae.sub("AE", y) y = re_unicode_uppercase_e.sub("E", y) y = re_unicode_uppercase_i.sub("I", y) y = re_unicode_uppercase_o.sub("O", y) y = re_unicode_uppercase_u.sub("U", y) y = re_unicode_uppercase_y.sub("Y", y) y = re_unicode_uppercase_c.sub("C", y) y = re_unicode_uppercase_n.sub("N", y) # return UTF-8 representation of the Unicode string: return y.encode("utf-8") def wash_index_term(term, max_char_length=50, lower_term=True): """ Return washed form of the index term TERM that would be suitable for storing into idxWORD* tables. I.e., lower the TERM if LOWER_TERM is True, and truncate it safely to MAX_CHAR_LENGTH UTF-8 characters (meaning, in principle, 4*MAX_CHAR_LENGTH bytes). The function works by an internal conversion of TERM, when needed, from its input Python UTF-8 binary string format into Python Unicode format, and then truncating it safely to the given number of UTF-8 characters, without possible mis-truncation in the middle of a multi-byte UTF-8 character that could otherwise happen if we had been working with the UTF-8 binary representation directly. Note that MAX_CHAR_LENGTH corresponds to the length of the term column in idxINDEX* tables. """ if lower_term: washed_term = unicode(term, 'utf-8').lower() else: washed_term = unicode(term, 'utf-8') if len(washed_term) <= max_char_length: # no need to truncate the term, because it will fit # nicely even if it uses four-byte UTF-8 characters return washed_term.encode('utf-8') else: # truncate the term in a safe position: return washed_term[:max_char_length].encode('utf-8') def lower_index_term(term): """ Return safely lowered index term TERM. This is done by converting to Unicode first, because the standard Python lower() function is not UTF-8 safe. To be called by both the search engine and the indexer when appropriate (e.g. before stemming). In case of problems with UTF-8 compliance, this function raises UnicodeDecodeError, so the client code may want to catch it. """ return unicode(term, 'utf-8').lower().encode('utf-8') def wash_output_format(format): """Wash output format FORMAT. Currently only prevents input like 'of=9' for backwards-compatible format that prints certain fields only (for this task, 'of=tm' is preferred)."""
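# Washing-helper sketches (expected behaviour per the code above and the
# two functions just below; the wildcard regexps are defined at the top
# of this module):
#
#     wash_index_term('Ellis' * 20)   # lowered, safely cut to 50 chars
#     lower_index_term('ÉCOLE')       # -> 'école'
#     wash_pattern('mu* ellis*')      # -> 'mu ellis*' (wildcard dropped
#                                     #    from the 1-3 letter word only)
#     wash_field(' wau ')             # -> 'author'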
if str(format[0:3]).isdigit() and len(format) != 6: # asked to print MARC tags, but not enough digits, # so let's switch back to HTML brief default return 'hb' else: return format def wash_pattern(p): """Wash pattern passed by URL. Check for sanity of the wildcard by removing wildcards if they are appended to extremely short words (1-3 letters). TODO: instead of this approximate treatment, it would be much better to introduce a temporal limit, e.g. to kill a query if it does not finish in 10 seconds.""" # strip accents: # p = strip_accents(p) # FIXME: when available, strip accents all the time # add leading/trailing whitespace for the two following wildcard-sanity checking regexps: p = " " + p + " " # replace spaces within quotes by __SPACE__ temporarily: p = re_pattern_single_quotes.sub(lambda x: "'"+string.replace(x.group(1), ' ', '__SPACE__')+"'", p) p = re_pattern_double_quotes.sub(lambda x: "\""+string.replace(x.group(1), ' ', '__SPACE__')+"\"", p) p = re_pattern_regexp_quotes.sub(lambda x: "/"+string.replace(x.group(1), ' ', '__SPACE__')+"/", p) # get rid of unquoted wildcards after spaces: p = re_pattern_wildcards_after_spaces.sub("\\1", p) # get rid of extremely short words (1-3 letters with wildcards): p = re_pattern_short_words.sub("\\1", p) # replace back __SPACE__ by spaces: p = re_pattern_space.sub(" ", p) # replace special terms: p = re_pattern_today.sub(time.strftime("%Y-%m-%d", time.localtime()), p) # remove unnecessary whitespace: p = string.strip(p) return p def wash_field(f): """Wash field passed by URL.""" # get rid of unnecessary whitespace: f = string.strip(f) # wash old-style CDS Invenio/ALEPH 'f' field argument, e.g. replaces 'wau' and 'au' by 'author' if CFG_WEBSEARCH_FIELDS_CONVERT.has_key(string.lower(f)): f = CFG_WEBSEARCH_FIELDS_CONVERT[string.lower(f)] return f def wash_dates(d1="", d1y=0, d1m=0, d1d=0, d2="", d2y=0, d2m=0, d2d=0): """ Take user-submitted date arguments D1 (full datetime string) or (D1Y, D1M, D1D) year, month, day tuple and D2 or (D2Y, D2M, D2D) and return (YYYY1-MM1-DD1 HH1:MM1:SS1, YYYY2-MM2-DD2 HH2:MM2:SS2) datetime strings in the YYYY-MM-DD HH:MM:SS format suitable for time restricted searching. Note that when both D1 and (D1Y, D1M, D1D) parameters are present, the precedence goes to D1. Ditto for D2*. Note that when (D1Y, D1M, D1D) are taken into account, some values may be missing and are completed e.g. to 01 or 12 according to whether it is the starting or the ending date.
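Sketch of the completion rules:

    wash_dates(d1y=2008, d1m=3)
    # -> ('2008-03-01 00:00:00', '9999-12-31 00:00:00')
    wash_dates(d1y=2008, d1m=3, d2y=2008, d2m=3)
    # -> ('2008-03-01 00:00:00', '2008-03-31 00:00:00')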
""" datetext1, datetext2 = "", "" # sanity checking: if d1 == "" and d1y == 0 and d1m == 0 and d1d == 0 and d2 == "" and d2y == 0 and d2m == 0 and d2d == 0: return ("", "") # nothing selected, so return empty values # wash first (starting) date: if d1: # full datetime string takes precedence: datetext1 = d1 else: # okay, first date passed as (year,month,day): if d1y: datetext1 += "%04d" % d1y else: datetext1 += "0000" if d1m: datetext1 += "-%02d" % d1m else: datetext1 += "-01" if d1d: datetext1 += "-%02d" % d1d else: datetext1 += "-01" datetext1 += " 00:00:00" # wash second (ending) date: if d2: # full datetime string takes precedence: datetext2 = d2 else: # okay, second date passed as (year,month,day): if d2y: datetext2 += "%04d" % d2y else: datetext2 += "9999" if d2m: datetext2 += "-%02d" % d2m else: datetext2 += "-12" if d2d: datetext2 += "-%02d" % d2d else: datetext2 += "-31" # NOTE: perhaps we should add max(datenumber) in # given month, but for our quering it's not # needed, 31 will always do datetext2 += " 00:00:00" # okay, return constructed YYYY-MM-DD HH:MM:SS datetexts: return (datetext1, datetext2) def is_hosted_collection(coll): """Check if the given collection is a hosted one; i.e. its dbquery starts with hostedcollection: Returns True if it is, False if it's not or if the result is empty or if the query failed""" res = run_sql("SELECT dbquery FROM collection WHERE name=%s", (coll, )) try: return res[0][0].startswith("hostedcollection:") except: return False def get_colID(c): "Return collection ID for collection name C. Return None if no match found." colID = None res = run_sql("SELECT id FROM collection WHERE name=%s", (c,), 1) if res: colID = res[0][0] return colID def get_coll_ancestors(coll): "Returns a list of ancestors for collection 'coll'." coll_ancestors = [] coll_ancestor = coll while 1: res = run_sql("""SELECT c.name FROM collection AS c LEFT JOIN collection_collection AS cc ON c.id=cc.id_dad LEFT JOIN collection AS ccc ON ccc.id=cc.id_son WHERE ccc.name=%s ORDER BY cc.id_dad ASC LIMIT 1""", (coll_ancestor,)) if res: coll_name = res[0][0] coll_ancestors.append(coll_name) coll_ancestor = coll_name else: break # ancestors found, return reversed list: coll_ancestors.reverse() return coll_ancestors def get_coll_sons(coll, type='r', public_only=1): """Return a list of sons (first-level descendants) of type 'type' for collection 'coll'. If public_only, then return only non-restricted son collections. """ coll_sons = [] query = "SELECT c.name FROM collection AS c "\ "LEFT JOIN collection_collection AS cc ON c.id=cc.id_son "\ "LEFT JOIN collection AS ccc ON ccc.id=cc.id_dad "\ "WHERE cc.type=%s AND ccc.name=%s" query += " ORDER BY cc.score DESC" res = run_sql(query, (type, coll)) for name in res: if not public_only or not collection_restricted_p(name[0]): coll_sons.append(name[0]) return coll_sons def get_coll_real_descendants(coll, type='_', get_hosted_colls=True): """Return a list of all descendants of collection 'coll' that are defined by a 'dbquery'. IOW, we need to decompose compound collections like "A & B" into "A" and "B" provided that "A & B" has no associated database query defined. 
""" coll_sons = [] res = run_sql("""SELECT c.name,c.dbquery FROM collection AS c LEFT JOIN collection_collection AS cc ON c.id=cc.id_son LEFT JOIN collection AS ccc ON ccc.id=cc.id_dad WHERE ccc.name=%s AND cc.type LIKE %s ORDER BY cc.score DESC""", (coll, type,)) for name, dbquery in res: if dbquery: # this is 'real' collection, so return it: if get_hosted_colls: coll_sons.append(name) else: if not dbquery.startswith("hostedcollection:"): coll_sons.append(name) else: # this is 'composed' collection, so recurse: coll_sons.extend(get_coll_real_descendants(name)) return coll_sons def browse_pattern(req, colls, p, f, rg, ln=CFG_SITE_LANG): """Browse either biliographic phrases or words indexes, and display it.""" # load the right message language _ = gettext_set_language(ln) ## is p enclosed in quotes? (coming from exact search) if p.startswith('"') and p.endswith('"'): p = p[1:-1] p_orig = p ## okay, "real browse" follows: ## FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test if not f and string.find(p, ":") > 0: # does 'p' contain ':'? f, p = string.split(p, ":", 1) ## do we search in words indexes? if not f: return browse_in_bibwords(req, p, f) index_id = get_index_id_from_field(f) if index_id != 0: coll = HitSet() for coll_name in colls: coll |= get_collection_reclist(coll_name) browsed_phrases_in_colls = get_nearest_terms_in_idxphrase_with_collection(p, index_id, rg/2, rg/2, coll) else: browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg+1)/2+1, (rg-1)/2+1) while not browsed_phrases: # try again and again with shorter and shorter pattern: try: p = p[:-1] browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg+1)/2+1, (rg-1)/2+1) except: # probably there are no hits at all: req.write(_("No values found.")) return ## try to check hits in these particular collection selection: browsed_phrases_in_colls = [] if 0: for phrase in browsed_phrases: phrase_hitset = HitSet() phrase_hitsets = search_pattern("", phrase, f, 'e') for coll in colls: phrase_hitset.union_update(phrase_hitsets[coll]) if len(phrase_hitset) > 0: # okay, this phrase has some hits in colls, so add it: browsed_phrases_in_colls.append([phrase, len(phrase_hitset)]) ## were there hits in collections? if browsed_phrases_in_colls == []: if browsed_phrases != []: #print_warning(req, """<p>No match close to <em>%s</em> found in given collections. #Please try different term.<p>Displaying matches in any collection...""" % p_orig) ## try to get nbhits for these phrases in any collection: for phrase in browsed_phrases: browsed_phrases_in_colls.append([phrase, get_nbhits_in_bibxxx(phrase, f)]) ## display results now: out = websearch_templates.tmpl_browse_pattern( f=f, fn=get_field_i18nname(get_field_name(f) or f, ln, False), ln=ln, browsed_phrases_in_colls=browsed_phrases_in_colls, colls=colls, rg=rg, ) req.write(out) return def browse_in_bibwords(req, p, f, ln=CFG_SITE_LANG): """Browse inside words indexes.""" if not p: return _ = gettext_set_language(ln) urlargd = {} urlargd.update(req.argd) urlargd['action'] = 'search' nearest_box = create_nearest_terms_box(urlargd, p, f, 'w', ln=ln, intro_text_p=0) req.write(websearch_templates.tmpl_search_in_bibwords( p = p, f = f, ln = ln, nearest_box = nearest_box )) return def search_pattern(req=None, p=None, f=None, m=None, ap=0, of="id", verbose=0, ln=CFG_SITE_LANG, display_nearest_terms_box=True): """Search for complex pattern 'p' within field 'f' according to matching type 'm'. Return hitset of recIDs. 
The function uses multi-stage searching algorithm in case of no exact match found. See the Search Internals document for detailed description. The 'ap' argument governs whether an alternative patterns are to be used in case there is no direct hit for (p,f,m). For example, whether to replace non-alphanumeric characters by spaces if it would give some hits. See the Search Internals document for detailed description. (ap=0 forbits the alternative pattern usage, ap=1 permits it.) The 'of' argument governs whether to print or not some information to the user in case of no match found. (Usually it prints the information in case of HTML formats, otherwise it's silent). The 'verbose' argument controls the level of debugging information to be printed (0=least, 9=most). All the parameters are assumed to have been previously washed. This function is suitable as a mid-level API. """ _ = gettext_set_language(ln) hitset_empty = HitSet() # sanity check: if not p: hitset_full = HitSet(trailing_bits=1) hitset_full.discard(0) # no pattern, so return all universe return hitset_full # search stage 1: break up arguments into basic search units: if verbose and of.startswith("h"): t1 = os.times()[4] basic_search_units = create_basic_search_units(req, p, f, m, of) if verbose and of.startswith("h"): t2 = os.times()[4] print_warning(req, "Search stage 1: basic search units are: %s" % cgi.escape(repr(basic_search_units))) print_warning(req, "Search stage 1: execution took %.2f seconds." % (t2 - t1)) # search stage 2: do search for each search unit and verify hit presence: if verbose and of.startswith("h"): t1 = os.times()[4] basic_search_units_hitsets = [] #prepare hiddenfield-related.. myhiddens = CFG_BIBFORMAT_HIDDEN_TAGS can_see_hidden = False if req: user_info = collect_user_info(req) can_see_hidden = (acc_authorize_action(user_info, 'runbibedit')[0] == 0) if can_see_hidden: myhiddens = [] for idx_unit in xrange(len(basic_search_units)): bsu_o, bsu_p, bsu_f, bsu_m = basic_search_units[idx_unit] basic_search_unit_hitset = search_unit(bsu_p, bsu_f, bsu_m) #check that the user is allowed to search with this tag #if he/she tries it if bsu_f and len(bsu_f) > 1 and bsu_f[0].isdigit() and bsu_f[1].isdigit(): for htag in myhiddens: ltag = len(htag) samelenfield = bsu_f[0:ltag] if samelenfield == htag: #user searches by a hidden tag #we won't show you anything.. basic_search_unit_hitset = HitSet() if verbose >= 9 and of.startswith("h"): print_warning(req, "Pattern %s hitlist omitted since \ it queries in a hidden tag %s" % (repr(bsu_p), repr(myhiddens))) display_nearest_terms_box=False #..and stop spying, too. 
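# Hidden-tag guard sketch: with CFG_BIBFORMAT_HIDDEN_TAGS containing,
# say, '595' (hypothetical) and the user lacking 'runbibedit':
#
#     bsu_f = '595__a'
#     [htag for htag in myhiddens if bsu_f[:len(htag)] == htag]
#     # non-empty -> the unit's hitset is replaced by an empty HitSet()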
if verbose >= 9 and of.startswith("h"): print_warning(req, "Search stage 1: pattern %s gave hitlist %s" % (cgi.escape(bsu_p), basic_search_unit_hitset)) if len(basic_search_unit_hitset) > 0 or \ ap==0 or \ bsu_o=="|" or \ ((idx_unit+1)<len(basic_search_units) and basic_search_units[idx_unit+1][0]=="|"): # stage 2-1: this basic search unit is retained, since # either the hitset is non-empty, or the approximate # pattern treatment is switched off, or the search unit # was joined by an OR operator to preceding/following # units so we do not require that it exists basic_search_units_hitsets.append(basic_search_unit_hitset) else: # stage 2-2: no hits found for this search unit, try to replace non-alphanumeric chars inside pattern: - if re.search(r'[^a-zA-Z0-9\s\:]', bsu_p): + if re.search(r'[^a-zA-Z0-9\s\:]', bsu_p) and bsu_f != 'refersto' and bsu_f != 'citedby': if bsu_p.startswith('"') and bsu_p.endswith('"'): # is it ACC query? bsu_pn = re.sub(r'[^a-zA-Z0-9\s\:]+', "*", bsu_p) else: # it is WRD query bsu_pn = re.sub(r'[^a-zA-Z0-9\s\:]+', " ", bsu_p) if verbose and of.startswith('h') and req: print_warning(req, "Trying (%s,%s,%s)" % (cgi.escape(bsu_pn), cgi.escape(bsu_f), cgi.escape(bsu_m))) basic_search_unit_hitset = search_pattern(req=None, p=bsu_pn, f=bsu_f, m=bsu_m, of="id", ln=ln) if len(basic_search_unit_hitset) > 0: # we retain the new unit instead if of.startswith('h'): print_warning(req, _("No exact match found for %(x_query1)s, using %(x_query2)s instead...") % \ {'x_query1': "<em>" + cgi.escape(bsu_p) + "</em>", 'x_query2': "<em>" + cgi.escape(bsu_pn) + "</em>"}) basic_search_units[idx_unit][1] = bsu_pn basic_search_units_hitsets.append(basic_search_unit_hitset) else: # stage 2-3: no hits found either, propose nearest indexed terms: if of.startswith('h') and display_nearest_terms_box: if req: if bsu_f == "recid": print_warning(req, _("Requested record does not seem to exist.")) else: print_warning(req, create_nearest_terms_box(req.argd, bsu_p, bsu_f, bsu_m, ln=ln)) return hitset_empty else: # stage 2-3: no hits found either, propose nearest indexed terms: if of.startswith('h') and display_nearest_terms_box: if req: if bsu_f == "recid": print_warning(req, _("Requested record does not seem to exist.")) else: print_warning(req, create_nearest_terms_box(req.argd, bsu_p, bsu_f, bsu_m, ln=ln)) return hitset_empty if verbose and of.startswith("h"): t2 = os.times()[4] for idx_unit in range(0, len(basic_search_units)): print_warning(req, "Search stage 2: basic search unit %s gave %d hits." % (basic_search_units[idx_unit][1:], len(basic_search_units_hitsets[idx_unit]))) print_warning(req, "Search stage 2: execution took %.2f seconds." % (t2 - t1)) # search stage 3: apply boolean query for each search unit: if verbose and of.startswith("h"): t1 = os.times()[4] # let the initial set be the complete universe: hitset_in_any_collection = HitSet(trailing_bits=1) hitset_in_any_collection.discard(0) for idx_unit in xrange(len(basic_search_units)): this_unit_operation = basic_search_units[idx_unit][0] this_unit_hitset = basic_search_units_hitsets[idx_unit] if this_unit_operation == '+': hitset_in_any_collection.intersection_update(this_unit_hitset) elif this_unit_operation == '-': hitset_in_any_collection.difference_update(this_unit_hitset) elif this_unit_operation == '|': hitset_in_any_collection.union_update(this_unit_hitset) else: if of.startswith("h"): print_warning(req, "Invalid set operation %s." 
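# Stage-3 folding sketch: starting from the full universe, each unit
# hitset is combined in query order (unit_hits is hypothetical):
#
#     hits = HitSet(trailing_bits=1)
#     hits.discard(0)
#     hits.intersection_update(unit_hits)   # '+'
#     hits.difference_update(unit_hits)     # '-'
#     hits.union_update(unit_hits)          # '|'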
% cgi.escape(this_unit_operation), "Error") if len(hitset_in_any_collection) == 0: # no hits found, propose alternative boolean query: if of.startswith('h') and display_nearest_terms_box: nearestterms = [] for idx_unit in range(0, len(basic_search_units)): bsu_o, bsu_p, bsu_f, bsu_m = basic_search_units[idx_unit] if bsu_p.startswith("%") and bsu_p.endswith("%"): bsu_p = "'" + bsu_p[1:-1] + "'" bsu_nbhits = len(basic_search_units_hitsets[idx_unit]) # create a similar query, but with the basic search unit only argd = {} argd.update(req.argd) argd['p'] = bsu_p argd['f'] = bsu_f nearestterms.append((bsu_p, bsu_nbhits, argd)) text = websearch_templates.tmpl_search_no_boolean_hits( ln=ln, nearestterms=nearestterms) print_warning(req, text) if verbose and of.startswith("h"): t2 = os.times()[4] print_warning(req, "Search stage 3: boolean query gave %d hits." % len(hitset_in_any_collection)) print_warning(req, "Search stage 3: execution took %.2f seconds." % (t2 - t1)) return hitset_in_any_collection def search_pattern_parenthesised(req=None, p=None, f=None, m=None, ap=0, of="id", verbose=0, ln=CFG_SITE_LANG, display_nearest_terms_box=True): """Search for complex pattern 'p' containing parenthesis within field 'f' according to matching type 'm'. Return hitset of recIDs. For more details on the parameters see 'search_pattern' """ _ = gettext_set_language(ln) # if the pattern uses SPIRES search syntax, convert it to Invenio syntax spires_syntax_converter = SpiresToInvenioSyntaxConverter() p = spires_syntax_converter.convert_query(p) # sanity check: do not call parenthesised parser for search terms # like U(1): if not re_pattern_parens.search(p): return search_pattern(req, p, f, m, ap, of, verbose, ln, display_nearest_terms_box=display_nearest_terms_box) # Try searching with parentheses try: parser = SearchQueryParenthesisedParser() # get a hitset with all recids result_hitset = HitSet(trailing_bits=1) # parse the query. The result is list of [op1, expr1, op2, expr2, ..., opN, exprN] parsing_result = parser.parse_query(p) if verbose and of.startswith("h"): print_warning(req, "Search stage 1: search_pattern_parenthesised() returned %s." % repr(parsing_result)) # go through every pattern # calculate hitset for it # combine pattern's hitset with the result using the corresponding operator for index in xrange(0, len(parsing_result)-1, 2 ): current_operator = parsing_result[index] current_pattern = parsing_result[index+1] # obtain a hitset for the current pattern current_hitset = search_pattern(req, current_pattern, f, m, ap, of, verbose, ln, display_nearest_terms_box=display_nearest_terms_box) # combine the current hitset with resulting hitset using the current operator if current_operator == '+': result_hitset = result_hitset & current_hitset elif current_operator == '-': result_hitset = result_hitset - current_hitset elif current_operator == '|': result_hitset = result_hitset | current_hitset else: assert False, "Unknown operator in search_pattern_parenthesised()" return result_hitset # If searching with parenteses fails, perform search ignoring parentheses except SyntaxError: print_warning(req, _("Search syntax misunderstood. Ignoring all parentheses in the query. If this doesn't help, please check your search and try again.")) # remove the parentheses in the query. 
Current implementation removes all the parentheses, # but it could be improved to romove only these that are not inside quotes p = p.replace('(', ' ') p = p.replace(')', ' ') return search_pattern(req, p, f, m, ap, of, verbose, ln, display_nearest_terms_box=display_nearest_terms_box) def search_unit(p, f=None, m=None): """Search for basic search unit defined by pattern 'p' and field 'f' and matching type 'm'. Return hitset of recIDs. All the parameters are assumed to have been previously washed. 'p' is assumed to be already a ``basic search unit'' so that it is searched as such and is not broken up in any way. Only wildcard and span queries are being detected inside 'p'. This function is suitable as a low-level API. """ ## create empty output results set: set = HitSet() if not p: # sanity checking return set if f == 'datecreated': set = search_unit_in_bibrec(p, p, 'c') elif f == 'datemodified': set = search_unit_in_bibrec(p, p, 'm') + elif f == 'refersto': + # we are doing search by the citation count + set = search_unit_refersto(p) + elif f == 'citedby': + # we are doing search by the citation count + set = search_unit_citedby(p) elif m == 'a' or m == 'r': # we are doing either phrase search or regexp search index_id = get_index_id_from_field(f) if index_id != 0: set = search_unit_in_idxphrases(p, f, m) else: set = search_unit_in_bibxxx(p, f, m) elif p.startswith("cited:"): # we are doing search by the citation count set = search_unit_by_times_cited(p[6:]) else: # we are doing bibwords search by default set = search_unit_in_bibwords(p, f) return set def search_unit_in_bibwords(word, f, decompress=zlib.decompress): """Searches for 'word' inside bibwordsX table for field 'f' and returns hitset of recIDs.""" set = HitSet() # will hold output result set set_used = 0 # not-yet-used flag, to be able to circumvent set operations # deduce into which bibwordsX table we will search: stemming_language = get_index_stemming_language(get_index_id_from_field("anyfield")) bibwordsX = "idxWORD%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: bibwordsX = "idxWORD%02dF" % index_id stemming_language = get_index_stemming_language(index_id) else: return HitSet() # word index f does not exist # wash 'word' argument and run query: word = string.replace(word, '*', '%') # we now use '*' as the truncation character words = string.split(word, "->", 1) # check for span query if len(words) == 2: word0 = re_word.sub('', words[0]) word1 = re_word.sub('', words[1]) if stemming_language: word0 = lower_index_term(word0) word1 = lower_index_term(word1) word0 = stem(word0, stemming_language) word1 = stem(word1, stemming_language) res = run_sql("SELECT term,hitlist FROM %s WHERE term BETWEEN %%s AND %%s" % bibwordsX, (wash_index_term(word0), wash_index_term(word1))) else: if f == 'journal': pass # FIXME: quick hack for the journal index else: word = re_word.sub('', word) if stemming_language: word = lower_index_term(word) word = stem(word, stemming_language) if string.find(word, '%') >= 0: # do we have wildcard in the word? 
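# Pattern-to-SQL sketch shared by the word and phrase unit searches
# (illustrative patterns):
#
#     'muon'        ->  term = %s                (exact lookup)
#     'muo*'        ->  term LIKE %s             ('*' rewritten to '%')
#     'mua->muz'    ->  term BETWEEN %s AND %s   (span query)
#     type == 'r'   ->  term REGEXP %s           (phrase/bibxxx searches)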
if f == 'journal': # FIXME: quick hack for the journal index # FIXME: we can run a sanity check here for all indexes res = () else: res = run_sql("SELECT term,hitlist FROM %s WHERE term LIKE %%s" % bibwordsX, (wash_index_term(word),)) else: res = run_sql("SELECT term,hitlist FROM %s WHERE term=%%s" % bibwordsX, (wash_index_term(word),)) # fill the result set: for word, hitlist in res: hitset_bibwrd = HitSet(hitlist) # add the results: if set_used: set.union_update(hitset_bibwrd) else: set = hitset_bibwrd set_used = 1 # okay, return result set: return set def search_unit_in_idxphrases(p, f, type): """Searches for phrase 'p' inside idxPHRASE*F table for field 'f' and returns hitset of recIDs found. The search type is defined by 'type' (e.g. equals to 'r' for a regexp search).""" set = HitSet() # will hold output result set set_used = 0 # not-yet-used flag, to be able to circumvent set operations # deduce in which idxPHRASE table we will search: idxphraseX = "idxPHRASE%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: idxphraseX = "idxPHRASE%02dF" % index_id else: return HitSet() # phrase index f does not exist # detect query type (exact phrase, partial phrase, regexp): if type == 'r': query_addons = "REGEXP %s" query_params = (p,) else: p = string.replace(p, '*', '%') # we now use '*' as the truncation character ps = string.split(p, "->", 1) # check for span query: if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')): query_addons = "BETWEEN %s AND %s" query_params = (ps[0], ps[1]) else: if string.find(p, '%') > -1: query_addons = "LIKE %s" query_params = (p,) else: query_addons = "= %s" query_params = (p,) # special washing for fuzzy author index: if f == 'author' or f == 'exactauthor': query_params_washed = () for query_param in query_params: query_params_washed += (wash_author_name(query_param),) query_params = query_params_washed # perform search: res = run_sql("SELECT term,hitlist FROM %s WHERE term %s" % (idxphraseX, query_addons), query_params) # fill the result set: for word, hitlist in res: hitset_bibphrase = HitSet(hitlist) # add the results: if set_used: set.union_update(hitset_bibphrase) else: set = hitset_bibphrase set_used = 1 # okay, return result set: return set def search_unit_in_bibxxx(p, f, type): """Searches for pattern 'p' inside bibxxx tables for field 'f' and returns hitset of recIDs found. The search type is defined by 'type' (e.g. 
       equals to 'r' for a regexp search)."""
    # FIXME: quick hack for the journal index
    if f == 'journal':
        return search_unit_in_bibwords(p, f)
    p_orig = p # saving for eventual future 'no match' reporting
    query_addons = "" # will hold additional SQL code for the query
    query_params = () # will hold parameters for the query (their number may vary depending on TYPE argument)
    # wash arguments:
    f = string.replace(f, '*', '%') # replace truncation char '*' in field definition
    if type == 'r':
        query_addons = "REGEXP %s"
        query_params = (p,)
    else:
        p = string.replace(p, '*', '%') # we now use '*' as the truncation character
        ps = string.split(p, "->", 1) # check for span query:
        if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
            query_addons = "BETWEEN %s AND %s"
            query_params = (ps[0], ps[1])
        else:
            if string.find(p, '%') > -1:
                query_addons = "LIKE %s"
                query_params = (p,)
            else:
                query_addons = "= %s"
                query_params = (p,)
    # construct 'tl' which defines the tag list (MARC tags) to search in:
    tl = []
    if str(f[0]).isdigit() and str(f[1]).isdigit():
        tl.append(f) # 'f' seems to be okay as it starts by two digits
    else:
        # convert old ALEPH tag names, if appropriate: (TODO: get rid of this before entering this function)
        if CFG_WEBSEARCH_FIELDS_CONVERT.has_key(string.lower(f)):
            f = CFG_WEBSEARCH_FIELDS_CONVERT[string.lower(f)]
        # deduce desired MARC tags on the basis of chosen 'f'
        tl = get_field_tags(f)
        if not tl:
            # f index does not exist, nevermind
            pass
    # okay, start search:
    l = [] # will hold list of recID that matched
    for t in tl:
        # deduce into which bibxxx table we will search:
        digit1, digit2 = int(t[0]), int(t[1])
        bx = "bib%d%dx" % (digit1, digit2)
        bibx = "bibrec_bib%d%dx" % (digit1, digit2)
        # construct and run query:
        if t == "001":
            res = run_sql("SELECT id FROM bibrec WHERE id %s" % query_addons,
                          query_params)
        else:
            query = "SELECT bibx.id_bibrec FROM %s AS bx LEFT JOIN %s AS bibx ON bx.id=bibx.id_bibxxx WHERE bx.value %s" % \
                    (bx, bibx, query_addons)
            if len(t) != 6 or t[-1:]=='%':
                # wildcard query, or only the beginning of field 't'
                # is defined, so add wildcard character:
                query += " AND bx.tag LIKE %s"
                res = run_sql(query, query_params + (t + '%',))
            else:
                # exact query for 't':
                query += " AND bx.tag=%s"
                res = run_sql(query, query_params + (t,))
        # fill the result set:
        for id_bibrec in res:
            if id_bibrec[0]:
                l.append(id_bibrec[0])
    # check no of hits found:
    nb_hits = len(l)
    # okay, return result set:
    set = HitSet(l)
    return set

def search_unit_in_bibrec(datetext1, datetext2, type='c'):
    """
    Return hitset of recIDs found that were either created or modified
    (according to 'type' arg being 'c' or 'm') from datetext1 until
    datetext2, inclusive.  Does not pay attention to pattern, collection,
    anything.  Useful to intersect later on with the 'real' query.
    """
    set = HitSet()
    if type.startswith("m"):
        type = "modification_date"
    else:
        type = "creation_date" # by default we are searching for creation dates
    if datetext1 == datetext2:
        res = run_sql("SELECT id FROM bibrec WHERE %s LIKE %%s" % (type,),
                      (datetext1 + '%',))
    else:
        res = run_sql("SELECT id FROM bibrec WHERE %s>=%%s AND %s<=%%s" % (type, type),
                      (datetext1, datetext2))
    for row in res:
        set += row[0]
    return set
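# A minimal illustrative sketch (assumption: a configured Invenio instance
# with some records created during 2008); it shows how the date-range unit
# above composes with an ordinary word search unit via hitset intersection:
def _demo_search_units_by_date():
    """Illustrative only; not used by the search engine itself."""
    author_hits = search_unit_in_bibwords('ellis', 'author')
    # records created any time during 2008 ('c' selects creation_date):
    date_hits = search_unit_in_bibrec('2008-01-01', '2008-12-31', 'c')
    return author_hits & date_hits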
""" numstr = '"'+p+'"' #this is sort of stupid but since we may need to #get the records that do _not_ have cites, we have to #know the ids of all records, too #but this is needed only if bsu_p is 0 or 0 or 0->0 allrecs = [] if p == 0 or p == "0" or \ p.startswith("0->") or p.endswith("->0"): allrecs = HitSet(run_sql_cached("SELECT id FROM bibrec", affected_tables=['bibrec'])) return get_records_with_num_cites(numstr, allrecs) +def search_unit_refersto(query): + """ + Search for records satisfying the query (e.g. author:ellis) and + return list of records referred to by these records. + """ + if query: + ahitset = search_pattern(p=query) + if ahitset: + return get_refersto_hitset(ahitset) + else: + return HitSet([]) + else: + return HitSet([]) + +def search_unit_citedby(query): + """ + Search for records satisfying the query (e.g. author:ellis) and + return list of records cited by these records. + """ + if query: + ahitset = search_pattern(p=query) + if ahitset: + return get_citedby_hitset(ahitset) + else: + return HitSet([]) + else: + return HitSet([]) + def intersect_results_with_collrecs(req, hitset_in_any_collection, colls, ap=0, of="hb", verbose=0, ln=CFG_SITE_LANG, display_nearest_terms_box=True): """Return dict of hitsets given by intersection of hitset with the collection universes.""" _ = gettext_set_language(ln) # search stage 4: intersect with the collection universe: if verbose and of.startswith("h"): t1 = os.times()[4] results = {} results_nbhits = 0 for coll in colls: results[coll] = hitset_in_any_collection & get_collection_reclist(coll) results_nbhits += len(results[coll]) if results_nbhits == 0: # no hits found, try to search in Home: results_in_Home = hitset_in_any_collection & get_collection_reclist(CFG_SITE_NAME) if len(results_in_Home) > 0: # some hits found in Home, so propose this search: if of.startswith("h") and display_nearest_terms_box: url = websearch_templates.build_search_url(req.argd, cc=CFG_SITE_NAME, c=[]) print_warning(req, _("No match found in collection %(x_collection)s. Other public collections gave %(x_url_open)s%(x_nb_hits)d hits%(x_url_close)s.") %\ {'x_collection': '<em>' + string.join([get_coll_i18nname(coll, ln, False) for coll in colls], ', ') + '</em>', 'x_url_open': '<a class="nearestterms" href="%s">' % (url), 'x_nb_hits': len(results_in_Home), 'x_url_close': '</a>'}) results = {} else: # no hits found in Home, recommend different search terms: if of.startswith("h") and display_nearest_terms_box: print_warning(req, _("No public collection matched your query. " "If you were looking for a non-public document, please choose " "the desired restricted collection first.")) results = {} if verbose and of.startswith("h"): t2 = os.times()[4] print_warning(req, "Search stage 4: intersecting with collection universe gave %d hits." % results_nbhits) print_warning(req, "Search stage 4: execution took %.2f seconds." % (t2 - t1)) return results def intersect_results_with_hitset(req, results, hitset, ap=0, aptext="", of="hb"): """Return intersection of search 'results' (a dict of hitsets with collection as key) with the 'hitset', i.e. apply 'hitset' intersection to each collection within search 'results'. If the final 'results' set is to be empty, and 'ap' (approximate pattern) is true, and then print the `warningtext' and return the original 'results' set unchanged. If 'ap' is false, then return empty results set. 
""" if ap: results_ap = copy.deepcopy(results) else: results_ap = {} # will return empty dict in case of no hits found nb_total = 0 for coll in results.keys(): results[coll].intersection_update(hitset) nb_total += len(results[coll]) if nb_total == 0: if of.startswith("h"): print_warning(req, aptext) results = results_ap return results def create_similarly_named_authors_link_box(author_name, ln=CFG_SITE_LANG): """Return a box similar to ``Not satisfied...'' one by proposing author searches for similar names. Namely, take AUTHOR_NAME and the first initial of the firstame (after comma) and look into author index whether authors with e.g. middle names exist. Useful mainly for CERN Library that sometimes contains name forms like Ellis-N, Ellis-Nick, Ellis-Nicolas all denoting the same person. The box isn't proposed if no similarly named authors are found to exist. """ # return nothing if not configured: if CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX == 0: return "" # return empty box if there is no initial: if re.match(r'[^ ,]+, [^ ]', author_name) is None: return "" # firstly find name comma initial: author_name_to_search = re.sub(r'^([^ ,]+, +[^ ,]).*$', '\\1', author_name) # secondly search for similar name forms: similar_author_names = {} for name in author_name_to_search, strip_accents(author_name_to_search): for tag in get_field_tags("author"): # deduce into which bibxxx table we will search: digit1, digit2 = int(tag[0]), int(tag[1]) bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) if len(tag) != 6 or tag[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value LIKE %%s AND bx.tag LIKE %%s""" % bx, (name + "%", tag + "%")) else: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value LIKE %%s AND bx.tag=%%s""" % bx, (name + "%", tag)) for row in res: similar_author_names[row[0]] = 1 # remove the original name and sort the list: try: del similar_author_names[author_name] except KeyError: pass # thirdly print the box: out = "" if similar_author_names: out_authors = similar_author_names.keys() out_authors.sort() tmp_authors = [] for out_author in out_authors: nbhits = get_nbhits_in_bibxxx(out_author, "author") if nbhits: tmp_authors.append((out_author, nbhits)) out += websearch_templates.tmpl_similar_author_names( authors=tmp_authors, ln=ln) return out def create_nearest_terms_box(urlargd, p, f, t='w', n=5, ln=CFG_SITE_LANG, intro_text_p=True): """Return text box containing list of 'n' nearest terms above/below 'p' for the field 'f' for matching type 't' (words/phrases) in language 'ln'. Propose new searches according to `urlargs' with the new words. If `intro_text_p' is true, then display the introductory message, otherwise print only the nearest terms in the box content. """ # load the right message language _ = gettext_set_language(ln) out = "" nearest_terms = [] if not p: # sanity check p = "." 
index_id = get_index_id_from_field(f) + # special indexes: + if f == 'refersto': + return _("There are no records referring to %s.") % cgi.escape(p) + if f == 'citedby': + return _("There are no records cited by %s.") % cgi.escape(p) # look for nearest terms: if t == 'w': nearest_terms = get_nearest_terms_in_bibwords(p, f, n, n) if not nearest_terms: return _("No word index is available for %s.") % \ ('<em>' + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + '</em>') else: nearest_terms = [] if index_id: nearest_terms = get_nearest_terms_in_idxphrase(p, index_id, n, n) if f == 'datecreated' or f == 'datemodified': nearest_terms = get_nearest_terms_in_bibrec(p, f, n, n) if not nearest_terms: nearest_terms = get_nearest_terms_in_bibxxx(p, f, n, n) if not nearest_terms: return _("No phrase index is available for %s.") % \ ('<em>' + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + '</em>') terminfo = [] for term in nearest_terms: if t == 'w': hits = get_nbhits_in_bibwords(term, f) else: if index_id: hits = get_nbhits_in_idxphrases(term, f) elif f == 'datecreated' or f == 'datemodified': hits = get_nbhits_in_bibrec(term, f) else: hits = get_nbhits_in_bibxxx(term, f) argd = {} argd.update(urlargd) # check which fields contained the requested parameter, and replace it. for (px, fx) in ('p', 'f'), ('p1', 'f1'), ('p2', 'f2'), ('p3', 'f3'): if px in argd: argd_px = argd[px] if t == 'w': # p was stripped of accents, to do the same: argd_px = strip_accents(argd_px) if f == argd[fx] or f == "anyfield" or f == "": if string.find(argd_px, p) > -1: argd[px] = string.replace(argd_px, p, term) break else: if string.find(argd_px, f+':'+p) > -1: argd[px] = string.replace(argd_px, f+':'+p, f+':'+term) break elif string.find(argd_px, f+':"'+p+'"') > -1: argd[px] = string.replace(argd_px, f+':"'+p+'"', f+':"'+term+'"') break terminfo.append((term, hits, argd)) intro = "" if intro_text_p: # add full leading introductory text if f: intro = _("Search term %(x_term)s inside index %(x_index)s did not match any record. Nearest terms in any collection are:") % \ {'x_term': "<em>" + cgi.escape(p.startswith("%") and p.endswith("%") and p[1:-1] or p) + "</em>", 'x_index': "<em>" + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + "</em>"} else: intro = _("Search term %s did not match any record. 
Nearest terms in any collection are:") % \ ("<em>" + cgi.escape(p.startswith("%") and p.endswith("%") and p[1:-1] or p) + "</em>") return websearch_templates.tmpl_nearest_term_box(p=p, ln=ln, f=f, terminfo=terminfo, intro=intro) def get_nearest_terms_in_bibwords(p, f, n_below, n_above): """Return list of +n -n nearest terms to word `p' in index for field `f'.""" nearest_words = [] # will hold the (sorted) list of nearest words to return # deduce into which bibwordsX table we will search: bibwordsX = "idxWORD%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: bibwordsX = "idxWORD%02dF" % index_id else: return nearest_words # firstly try to get `n' closest words above `p': res = run_sql("SELECT term FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % bibwordsX, (p, n_above)) for row in res: nearest_words.append(row[0]) nearest_words.reverse() # secondly insert given word `p': nearest_words.append(p) # finally try to get `n' closest words below `p': res = run_sql("SELECT term FROM %s WHERE term>%%s ORDER BY term ASC LIMIT %%s" % bibwordsX, (p, n_below)) for row in res: nearest_words.append(row[0]) return nearest_words def get_nearest_terms_in_idxphrase(p, index_id, n_below, n_above): """Browse (-n_above, +n_below) closest bibliographic phrases for the given pattern p in the given field idxPHRASE table, regardless of collection. Return list of [phrase1, phrase2, ... , phrase_n].""" if CFG_INSPIRE_SITE and index_id == 3: # FIXME: workaround due to new fuzzy index return [p,] idxphraseX = "idxPHRASE%02dF" % index_id res_above = run_sql("SELECT term FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % idxphraseX, (p, n_above)) res_above = map(lambda x: x[0], res_above) res_above.reverse() res_below = run_sql("SELECT term FROM %s WHERE term>=%%s ORDER BY term ASC LIMIT %%s" % idxphraseX, (p, n_below)) res_below = map(lambda x: x[0], res_below) return res_above + res_below def get_nearest_terms_in_idxphrase_with_collection(p, index_id, n_below, n_above, collection): """Browse (-n_above, +n_below) closest bibliographic phrases for the given pattern p in the given field idxPHRASE table, considering the collection (HitSet). Return list of [(phrase1, hitset), (phrase2, hitset), ... , (phrase_n, hitset)].""" idxphraseX = "idxPHRASE%02dF" % index_id res_above = run_sql("SELECT term,hitlist FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % idxphraseX, (p, n_above * 3)) res_above = [(term, HitSet(hitlist) & collection) for term, hitlist in res_above] res_above = [(term, len(hitlist)) for term, hitlist in res_above if hitlist] res_below = run_sql("SELECT term,hitlist FROM %s WHERE term>=%%s ORDER BY term ASC LIMIT %%s" % idxphraseX, (p, n_below * 3)) res_below = [(term, HitSet(hitlist) & collection) for term, hitlist in res_below] res_below = [(term, len(hitlist)) for term, hitlist in res_below if hitlist] res_above.reverse() return res_above[-n_above:] + res_below[:n_below] def get_nearest_terms_in_bibxxx(p, f, n_below, n_above): """Browse (-n_above, +n_below) closest bibliographic phrases for the given pattern p in the given field f, regardless of collection. Return list of [phrase1, phrase2, ... , phrase_n].""" ## determine browse field: if not f and string.find(p, ":") > 0: # does 'p' contain ':'? f, p = string.split(p, ":", 1) # FIXME: quick hack for the journal index if f == 'journal': return get_nearest_terms_in_bibwords(p, f, n_below, n_above) ## We are going to take max(n_below, n_above) as the number of ## values to ferch from bibXXx. 
This is needed to work around ## MySQL UTF-8 sorting troubles in 4.0.x. Proper solution is to ## use MySQL 4.1.x or our own idxPHRASE in the future. index_id = get_index_id_from_field(f) if index_id: return get_nearest_terms_in_idxphrase(p, index_id, n_below, n_above) n_fetch = 2*max(n_below, n_above) ## construct 'tl' which defines the tag list (MARC tags) to search in: tl = [] if str(f[0]).isdigit() and str(f[1]).isdigit(): tl.append(f) # 'f' seems to be okay as it starts by two digits else: # deduce desired MARC tags on the basis of chosen 'f' tl = get_field_tags(f) ## start browsing to fetch list of hits: browsed_phrases = {} # will hold {phrase1: 1, phrase2: 1, ..., phraseN: 1} dict of browsed phrases (to make them unique) # always add self to the results set: browsed_phrases[p.startswith("%") and p.endswith("%") and p[1:-1] or p] = 1 for t in tl: # deduce into which bibxxx table we will search: digit1, digit2 = int(t[0]), int(t[1]) bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) # firstly try to get `n' closest phrases above `p': if len(t) != 6 or t[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value<%%s AND bx.tag LIKE %%s ORDER BY bx.value DESC LIMIT %%s""" % bx, (p, t + "%", n_fetch)) else: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value<%%s AND bx.tag=%%s ORDER BY bx.value DESC LIMIT %%s""" % bx, (p, t, n_fetch)) for row in res: browsed_phrases[row[0]] = 1 # secondly try to get `n' closest phrases equal to or below `p': if len(t) != 6 or t[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value>=%%s AND bx.tag LIKE %%s ORDER BY bx.value ASC LIMIT %%s""" % bx, (p, t + "%", n_fetch)) else: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value>=%%s AND bx.tag=%%s ORDER BY bx.value ASC LIMIT %%s""" % bx, (p, t, n_fetch)) for row in res: browsed_phrases[row[0]] = 1 # select first n words only: (this is needed as we were searching # in many different tables and so aren't sure we have more than n # words right; this of course won't be needed when we shall have # one ACC table only for given field): phrases_out = browsed_phrases.keys() phrases_out.sort(lambda x, y: cmp(string.lower(strip_accents(x)), string.lower(strip_accents(y)))) # find position of self: try: idx_p = phrases_out.index(p) except: idx_p = len(phrases_out)/2 # return n_above and n_below: return phrases_out[max(0, idx_p-n_above):idx_p+n_below] def get_nearest_terms_in_bibrec(p, f, n_below, n_above): """Return list of nearest terms and counts from bibrec table. p is usually a date, and f either datecreated or datemodified. Note: below/above count is very approximative, not really respected. """ col = 'creation_date' if f == 'datemodified': col = 'modification_date' res_above = run_sql("""SELECT DATE_FORMAT(%s,'%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s') FROM bibrec WHERE %s < %%s ORDER BY %s ASC LIMIT %%s""" % (col, col, col), (p, n_above)) res_below = run_sql("""SELECT DATE_FORMAT(%s,'%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s') FROM bibrec WHERE %s > %%s ORDER BY %s ASC LIMIT %%s""" % (col, col, col), (p, n_below)) out = set([]) for row in res_above: out.add(row[0]) for row in res_below: out.add(row[0]) return list(out) def get_nbhits_in_bibrec(term, f): """Return number of hits in bibrec table. 
term is usually a date, and f is either 'datecreated' or 'datemodified'.""" col = 'creation_date' if f == 'datemodified': col = 'modification_date' res = run_sql("SELECT COUNT(*) FROM bibrec WHERE %s LIKE %%s" % (col,), (term + '%',)) return res[0][0] def get_nbhits_in_bibwords(word, f): """Return number of hits for word 'word' inside words index for field 'f'.""" out = 0 # deduce into which bibwordsX table we will search: bibwordsX = "idxWORD%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: bibwordsX = "idxWORD%02dF" % index_id else: return 0 if word: res = run_sql("SELECT hitlist FROM %s WHERE term=%%s" % bibwordsX, (word,)) for hitlist in res: out += len(HitSet(hitlist[0])) return out def get_nbhits_in_idxphrases(word, f): """Return number of hits for word 'word' inside phrase index for field 'f'.""" out = 0 # deduce into which bibwordsX table we will search: idxphraseX = "idxPHRASE%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: idxphraseX = "idxPHRASE%02dF" % index_id else: return 0 if word: res = run_sql("SELECT hitlist FROM %s WHERE term=%%s" % idxphraseX, (word,)) for hitlist in res: out += len(HitSet(hitlist[0])) return out def get_nbhits_in_bibxxx(p, f): """Return number of hits for word 'word' inside words index for field 'f'.""" ## determine browse field: if not f and string.find(p, ":") > 0: # does 'p' contain ':'? f, p = string.split(p, ":", 1) # FIXME: quick hack for the journal index if f == 'journal': return get_nbhits_in_bibwords(p, f) ## construct 'tl' which defines the tag list (MARC tags) to search in: tl = [] if str(f[0]).isdigit() and str(f[1]).isdigit(): tl.append(f) # 'f' seems to be okay as it starts by two digits else: # deduce desired MARC tags on the basis of chosen 'f' tl = get_field_tags(f) # start searching: recIDs = {} # will hold dict of {recID1: 1, recID2: 1, ..., } (unique recIDs, therefore) for t in tl: # deduce into which bibxxx table we will search: digit1, digit2 = int(t[0]), int(t[1]) bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) if len(t) != 6 or t[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character: res = run_sql("""SELECT bibx.id_bibrec FROM %s AS bibx, %s AS bx WHERE bx.value=%%s AND bx.tag LIKE %%s AND bibx.id_bibxxx=bx.id""" % (bibx, bx), (p, t + "%")) else: res = run_sql("""SELECT bibx.id_bibrec FROM %s AS bibx, %s AS bx WHERE bx.value=%%s AND bx.tag=%%s AND bibx.id_bibxxx=bx.id""" % (bibx, bx), (p, t)) for row in res: recIDs[row[0]] = 1 return len(recIDs) def get_mysql_recid_from_aleph_sysno(sysno): """Returns DB's recID for ALEPH sysno passed in the argument (e.g. "002379334CER"). Returns None in case of failure.""" out = None res = run_sql("""SELECT bb.id_bibrec FROM bibrec_bib97x AS bb, bib97x AS b WHERE b.value=%s AND b.tag='970__a' AND bb.id_bibxxx=b.id""", (sysno,)) if res: out = res[0][0] return out def guess_primary_collection_of_a_record(recID): """Return primary collection name a record recid belongs to, by testing 980 identifier. May lead to bad guesses when a collection is defined dynamically via dbquery. 
In that case, return 'CFG_SITE_NAME'.""" out = CFG_SITE_NAME dbcollids = get_fieldvalues(recID, "980__a") if dbcollids: dbquery = "collection:" + dbcollids[0] res = run_sql("SELECT name FROM collection WHERE dbquery=%s", (dbquery,)) if res: out = res[0][0] if CFG_CERN_SITE: # dirty hack for ATLAS collections at CERN: if out in ('ATLAS Communications', 'ATLAS Internal Notes'): for alternative_collection in ('ATLAS Communications Physics', 'ATLAS Communications General', 'ATLAS Internal Notes Physics', 'ATLAS Internal Notes General',): if recID in get_collection_reclist(alternative_collection): out = alternative_collection break return out _re_collection_url = re.compile('/collection/(.+)') def guess_collection_of_a_record(recID, referer=None): """Return collection name a record recid belongs to, by first testing the referer URL if provided and otherwise returning the primary collection.""" if referer: dummy, hostname, path, dummy, query, dummy = urlparse.urlparse(referer) g = _re_collection_url.match(path) if g: name = urllib.unquote_plus(g.group(1)) if recID in get_collection_reclist(name): return name elif path.startswith('/search'): query = cgi.parse_qs(query) for name in query.get('cc', []) + query.get('c', []): if recID in get_collection_reclist(name): return name return guess_primary_collection_of_a_record(recID) def get_all_collections_of_a_record(recID): """Return all the collection names a record belongs to. Note this function is O(n_collections).""" ret = [] for name in collection_reclist_cache.cache.keys(): if recID in get_collection_reclist(name): ret.append(name) return ret def get_tag_name(tag_value, prolog="", epilog=""): """Return tag name from the known tag value, by looking up the 'tag' table. Return empty string in case of failure. Example: input='100__%', output=first author'.""" out = "" res = run_sql_cached("SELECT name FROM tag WHERE value=%s", (tag_value,), affected_tables=['tag',]) if res: out = prolog + res[0][0] + epilog return out def get_fieldcodes(): """Returns a list of field codes that may have been passed as 'search options' in URL. Example: output=['subject','division'].""" out = [] res = run_sql_cached("SELECT DISTINCT(code) FROM field", affected_tables=['field',]) for row in res: out.append(row[0]) return out def get_field_name(code): """Return the corresponding field_name given the field code. e.g. reportnumber -> report number.""" res = run_sql_cached("SELECT name FROM field WHERE code=%s", (code, ), affected_tables=['field',]) if res: return res[0][0] else: return "" def get_field_tags(field): """Returns a list of MARC tags for the field code 'field'. Returns empty list in case of error. Example: field='author', output=['100__%','700__%'].""" out = [] query = """SELECT t.value FROM tag AS t, field_tag AS ft, field AS f WHERE f.code=%s AND ft.id_field=f.id AND t.id=ft.id_tag ORDER BY ft.score DESC""" res = run_sql(query, (field, )) for val in res: out.append(val[0]) return out def get_fieldvalues(recIDs, tag, repetitive_values=True): """ Return list of field values for field TAG for the given record ID or list of record IDs. (RECIDS can be both an integer or a list of integers.) If REPETITIVE_VALUES is set to True, then return all values even if they are doubled. If set to False, then return unique values only. 
""" out = [] if isinstance(recIDs, (int, long)): recIDs =[recIDs,] if not isinstance(recIDs, (list, tuple)): return [] if len(recIDs) == 0: return [] if tag == "001___": # we have asked for tag 001 (=recID) that is not stored in bibXXx tables out = [str(recID) for recID in recIDs] else: # we are going to look inside bibXXx tables digits = tag[0:2] try: intdigits = int(digits) if intdigits < 0 or intdigits > 99: raise ValueError except ValueError: # invalid tag value asked for return [] bx = "bib%sx" % digits bibx = "bibrec_bib%sx" % digits queryparam = [] for recID in recIDs: queryparam.append(recID) if not repetitive_values: queryselect = "DISTINCT(bx.value)" else: queryselect = "bx.value" query = "SELECT %s FROM %s AS bx, %s AS bibx WHERE bibx.id_bibrec IN (%s) " \ " AND bx.id=bibx.id_bibxxx AND bx.tag LIKE %%s " \ " ORDER BY bibx.field_number, bx.tag ASC" % \ (queryselect, bx, bibx, ("%s,"*len(queryparam))[:-1]) res = run_sql(query, tuple(queryparam) + (tag,)) for row in res: out.append(row[0]) return out def get_fieldvalues_alephseq_like(recID, tags_in, can_see_hidden=False): """Return buffer of ALEPH sequential-like textual format with fields found in the list TAGS_IN for record RECID. If can_see_hidden is True, just print everything. Otherwise hide fields from CFG_BIBFORMAT_HIDDEN_TAGS. """ out = "" if type(tags_in) is not list: tags_in = [tags_in,] if len(tags_in) == 1 and len(tags_in[0]) == 6: ## case A: one concrete subfield asked, so print its value if found ## (use with care: can mislead if field has multiple occurrences) out += string.join(get_fieldvalues(recID, tags_in[0]),"\n") else: ## case B: print our "text MARC" format; works safely all the time # find out which tags to output: dict_of_tags_out = {} if not tags_in: for i in range(0, 10): for j in range(0, 10): dict_of_tags_out["%d%d%%" % (i, j)] = 1 else: for tag in tags_in: if len(tag) == 0: for i in range(0, 10): for j in range(0, 10): dict_of_tags_out["%d%d%%" % (i, j)] = 1 elif len(tag) == 1: for j in range(0, 10): dict_of_tags_out["%s%d%%" % (tag, j)] = 1 elif len(tag) < 5: dict_of_tags_out["%s%%" % tag] = 1 elif tag >= 6: dict_of_tags_out[tag[0:5]] = 1 tags_out = dict_of_tags_out.keys() tags_out.sort() # search all bibXXx tables as needed: for tag in tags_out: digits = tag[0:2] try: intdigits = int(digits) if intdigits < 0 or intdigits > 99: raise ValueError except ValueError: # invalid tag value asked for continue if tag.startswith("001") or tag.startswith("00%"): if out: out += "\n" out += "%09d %s %d" % (recID, "001__", recID) bx = "bib%sx" % digits bibx = "bibrec_bib%sx" % digits query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\ "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s"\ "ORDER BY bb.field_number, b.tag ASC" % (bx, bibx) res = run_sql(query, (recID, str(tag)+'%')) # go through fields: field_number_old = -999 field_old = "" for row in res: field, value, field_number = row[0], row[1], row[2] ind1, ind2 = field[3], field[4] printme = True #check the stuff in hiddenfields if not can_see_hidden: for htag in CFG_BIBFORMAT_HIDDEN_TAGS: ltag = len(htag) samelenfield = field[0:ltag] if samelenfield == htag: printme = False if ind1 == "_": ind1 = "" if ind2 == "_": ind2 = "" # print field tag if printme: if field_number != field_number_old or field[:-1] != field_old[:-1]: if out: out += "\n" out += "%09d %s " % (recID, field[:5]) field_number_old = field_number field_old = field # print subfield value if field[0:2] == "00" and field[-1:] == "_": out += value else: out += 
"$$%s%s" % (field[-1:], value) return out def record_exists(recID): """Return 1 if record RECID exists. Return 0 if it doesn't exist. Return -1 if it exists but is marked as deleted. """ out = 0 res = run_sql("SELECT id FROM bibrec WHERE id=%s", (recID,), 1) if res: recID = int(recID) # record exists; now check whether it isn't marked as deleted: dbcollids = get_fieldvalues(recID, "980__%") if ("DELETED" in dbcollids) or (CFG_CERN_SITE and "DUMMY" in dbcollids): out = -1 # exists, but marked as deleted else: out = 1 # exists fine return out def record_empty(recID): """ Is this record empty, e.g. has only 001, waiting for integration? @param recID: the record identifier. @type recID: int @return: 1 if the record is empty, 0 otherwise. @rtype: int """ record = get_record(recID) if record is None or len(record) < 2: return 1 else: return 0 def record_public_p(recID): """Return 1 if the record is public, i.e. if it can be found in the Home collection. Return 0 otherwise. """ return recID in get_collection_reclist(CFG_SITE_NAME) def get_creation_date(recID, fmt="%Y-%m-%d"): "Returns the creation date of the record 'recID'." out = "" res = run_sql("SELECT DATE_FORMAT(creation_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1) if res: out = res[0][0] return out def get_modification_date(recID, fmt="%Y-%m-%d"): "Returns the date of last modification for the record 'recID'." out = "" res = run_sql("SELECT DATE_FORMAT(modification_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1) if res: out = res[0][0] return out def print_warning(req, msg, type='', prologue='<br />', epilogue='<br />'): "Prints warning message and flushes output." if req and msg: req.write(websearch_templates.tmpl_print_warning( msg = msg, type = type, prologue = prologue, epilogue = epilogue, )) return def print_search_info(p, f, sf, so, sp, rm, of, ot, collection=CFG_SITE_NAME, nb_found=-1, jrec=1, rg=10, aas=0, ln=CFG_SITE_LANG, p1="", p2="", p3="", f1="", f2="", f3="", m1="", m2="", m3="", op1="", op2="", sc=1, pl_in_url="", d1y=0, d1m=0, d1d=0, d2y=0, d2m=0, d2d=0, dt="", cpu_time=-1, middle_only=0): """Prints stripe with the information on 'collection' and 'nb_found' results and CPU time. Also, prints navigation links (beg/next/prev/end) inside the results set. If middle_only is set to 1, it will only print the middle box information (beg/netx/prev/end/etc) links. 
This is suitable for displaying navigation links at the bottom of the search results page.""" out = "" # sanity check: if jrec < 1: jrec = 1 if jrec > nb_found: jrec = max(nb_found-rg+1, 1) return websearch_templates.tmpl_print_search_info( ln = ln, collection = collection, aas = aas, collection_name = get_coll_i18nname(collection, ln, False), collection_id = get_colID(collection), middle_only = middle_only, rg = rg, nb_found = nb_found, sf = sf, so = so, rm = rm, of = of, ot = ot, p = p, f = f, p1 = p1, p2 = p2, p3 = p3, f1 = f1, f2 = f2, f3 = f3, m1 = m1, m2 = m2, m3 = m3, op1 = op1, op2 = op2, pl_in_url = pl_in_url, d1y = d1y, d1m = d1m, d1d = d1d, d2y = d2y, d2m = d2m, d2d = d2d, dt = dt, jrec = jrec, sc = sc, sp = sp, all_fieldcodes = get_fieldcodes(), cpu_time = cpu_time, ) def print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, collection=CFG_SITE_NAME, nb_found=-1, jrec=1, rg=10, aas=0, ln=CFG_SITE_LANG, p1="", p2="", p3="", f1="", f2="", f3="", m1="", m2="", m3="", op1="", op2="", sc=1, pl_in_url="", d1y=0, d1m=0, d1d=0, d2y=0, d2m=0, d2d=0, dt="", cpu_time=-1, middle_only=0): """Prints stripe with the information on 'collection' and 'nb_found' results and CPU time. Also, prints navigation links (beg/next/prev/end) inside the results set. If middle_only is set to 1, it will only print the middle box information (beg/netx/prev/end/etc) links. This is suitable for displaying navigation links at the bottom of the search results page.""" out = "" # sanity check: if jrec < 1: jrec = 1 if jrec > nb_found: jrec = max(nb_found-rg+1, 1) return websearch_templates.tmpl_print_hosted_search_info( ln = ln, collection = collection, aas = aas, collection_name = get_coll_i18nname(collection, ln, False), collection_id = get_colID(collection), middle_only = middle_only, rg = rg, nb_found = nb_found, sf = sf, so = so, rm = rm, of = of, ot = ot, p = p, f = f, p1 = p1, p2 = p2, p3 = p3, f1 = f1, f2 = f2, f3 = f3, m1 = m1, m2 = m2, m3 = m3, op1 = op1, op2 = op2, pl_in_url = pl_in_url, d1y = d1y, d1m = d1m, d1d = d1d, d2y = d2y, d2m = d2m, d2d = d2d, dt = dt, jrec = jrec, sc = sc, sp = sp, all_fieldcodes = get_fieldcodes(), cpu_time = cpu_time, ) def print_results_overview(req, colls, results_final_nb_total, results_final_nb, cpu_time, ln=CFG_SITE_LANG, ec=[], hosted_colls_potential_results_p=False): """Prints results overview box with links to particular collections below.""" out = "" new_colls = [] for coll in colls: new_colls.append({ 'id': get_colID(coll), 'code': coll, 'name': get_coll_i18nname(coll, ln, False), }) return websearch_templates.tmpl_print_results_overview( ln = ln, results_final_nb_total = results_final_nb_total, results_final_nb = results_final_nb, cpu_time = cpu_time, colls = new_colls, ec = ec, hosted_colls_potential_results_p = hosted_colls_potential_results_p, ) def print_hosted_results(url_and_engine, ln=CFG_SITE_LANG, of=None, req=None, no_records_found=False, search_timed_out=False, limit=CFG_EXTERNAL_COLLECTION_MAXRESULTS): """Prints the full results of a hosted collection""" if of.startswith("h"): if no_records_found: return "<br />No results found." if search_timed_out: return "<br />The search engine did not respond in time." return websearch_templates.tmpl_print_hosted_results( url_and_engine=url_and_engine, ln=ln, of=of, req=req, limit=limit ) def sort_records(req, recIDs, sort_field='', sort_order='d', sort_pattern='', verbose=0, of='hb', ln=CFG_SITE_LANG): """Sort records in 'recIDs' list according sort field 'sort_field' in order 'sort_order'. 
If more than one instance of 'sort_field' is found for a given record, try to choose that that is given by 'sort pattern', for example "sort by report number that starts by CERN-PS". Note that 'sort_field' can be field code like 'author' or MARC tag like '100__a' directly.""" _ = gettext_set_language(ln) ## check arguments: if not sort_field: return recIDs if len(recIDs) > CFG_WEBSEARCH_NB_RECORDS_TO_SORT: if of.startswith('h'): print_warning(req, _("Sorry, sorting is allowed on sets of up to %d records only. Using default sort order.") % CFG_WEBSEARCH_NB_RECORDS_TO_SORT, "Warning") return recIDs sort_fields = string.split(sort_field, ",") recIDs_dict = {} recIDs_out = [] ## first deduce sorting MARC tag out of the 'sort_field' argument: tags = [] for sort_field in sort_fields: if sort_field and str(sort_field[0:2]).isdigit(): # sort_field starts by two digits, so this is probably a MARC tag already tags.append(sort_field) else: # let us check the 'field' table query = """SELECT DISTINCT(t.value) FROM tag AS t, field_tag AS ft, field AS f WHERE f.code=%s AND ft.id_field=f.id AND t.id=ft.id_tag ORDER BY ft.score DESC""" res = run_sql(query, (sort_field, )) if res: for row in res: tags.append(row[0]) else: if of.startswith('h'): print_warning(req, _("Sorry, %s does not seem to be a valid sort option. Choosing title sort instead.") % cgi.escape(sort_field), "Error") tags.append("245__a") if verbose >= 3: print_warning(req, "Sorting by tags %s." % cgi.escape(repr(tags))) if sort_pattern: print_warning(req, "Sorting preferentially by %s." % cgi.escape(sort_pattern)) ## check if we have sorting tag defined: if tags: # fetch the necessary field values: for recID in recIDs: val = "" # will hold value for recID according to which sort vals = [] # will hold all values found in sorting tag for recID for tag in tags: vals.extend(get_fieldvalues(recID, tag)) if sort_pattern: # try to pick that tag value that corresponds to sort pattern bingo = 0 for v in vals: if v.lower().startswith(sort_pattern.lower()): # bingo! bingo = 1 val = v break if not bingo: # sort_pattern not present, so add other vals after spaces val = sort_pattern + " " + string.join(vals) else: # no sort pattern defined, so join them all together val = string.join(vals) val = strip_accents(val.lower()) # sort values regardless of accents and case if recIDs_dict.has_key(val): recIDs_dict[val].append(recID) else: recIDs_dict[val] = [recID] # sort them: recIDs_dict_keys = recIDs_dict.keys() recIDs_dict_keys.sort() # now that keys are sorted, create output array: for k in recIDs_dict_keys: for s in recIDs_dict[k]: recIDs_out.append(s) # ascending or descending? if sort_order == 'a': recIDs_out.reverse() # okay, we are done return recIDs_out else: # good, no sort needed return recIDs def print_records(req, recIDs, jrec=1, rg=10, format='hb', ot='', ln=CFG_SITE_LANG, relevances=[], relevances_prologue="(", relevances_epilogue="%%)", decompress=zlib.decompress, search_pattern='', print_records_prologue_p=True, print_records_epilogue_p=True, verbose=0, tab=''): """ Prints list of records 'recIDs' formatted according to 'format' in groups of 'rg' starting from 'jrec'. Assumes that the input list 'recIDs' is sorted in reverse order, so it counts records from tail to head. A value of 'rg=-9999' means to print all records: to be used with care. Print also list of RELEVANCES for each record (if defined), in between RELEVANCE_PROLOGUE and RELEVANCE_EPILOGUE. 
Print prologue and/or epilogue specific to 'format' if 'print_records_prologue_p' and/or print_records_epilogue_p' are True. """ # load the right message language _ = gettext_set_language(ln) # sanity checking: if req is None: return # get user_info (for formatting based on user) if isinstance(req, cStringIO.OutputType): user_info = {} else: user_info = collect_user_info(req) if len(recIDs): nb_found = len(recIDs) if rg == -9999: # print all records rg = nb_found else: rg = abs(rg) if jrec < 1: # sanity checks jrec = 1 if jrec > nb_found: jrec = max(nb_found-rg+1, 1) # will print records from irec_max to irec_min excluded: irec_max = nb_found - jrec irec_min = nb_found - jrec - rg if irec_min < 0: irec_min = -1 if irec_max >= nb_found: irec_max = nb_found - 1 #req.write("%s:%d-%d" % (recIDs, irec_min, irec_max)) if format.startswith('x'): # print header if needed if print_records_prologue_p: print_records_prologue(req, format) # print records recIDs_to_print = [recIDs[x] for x in range(irec_max, irec_min, -1)] format_records(recIDs_to_print, format, ln=ln, search_pattern=search_pattern, record_separator="\n", user_info=user_info, req=req) # print footer if needed if print_records_epilogue_p: print_records_epilogue(req, format) elif format.startswith('t') or str(format[0:3]).isdigit(): # we are doing plain text output: for irec in range(irec_max, irec_min, -1): x = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) req.write(x) if x: req.write('\n') elif format == 'excel': recIDs_to_print = [recIDs[x] for x in range(irec_max, irec_min, -1)] create_excel(recIDs=recIDs_to_print, req=req, ln=ln, ot=ot) else: # we are doing HTML output: if format == 'hp' or format.startswith("hb_") or format.startswith("hd_"): # portfolio and on-the-fly formats: for irec in range(irec_max, irec_min, -1): req.write(print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose)) elif format.startswith("hb"): # HTML brief format: display_add_to_basket = True if user_info: if user_info['email'] == 'guest': if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS > 4: display_add_to_basket = False else: if not user_info['precached_usebaskets']: display_add_to_basket = False req.write(websearch_templates.tmpl_record_format_htmlbrief_header( ln = ln)) for irec in range(irec_max, irec_min, -1): row_number = jrec+irec_max-irec recid = recIDs[irec] if relevances and relevances[irec]: relevance = relevances[irec] else: relevance = '' record = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) req.write(websearch_templates.tmpl_record_format_htmlbrief_body( ln = ln, recid = recid, row_number = row_number, relevance = relevance, record = record, relevances_prologue = relevances_prologue, relevances_epilogue = relevances_epilogue, display_add_to_basket = display_add_to_basket )) req.write(websearch_templates.tmpl_record_format_htmlbrief_footer( ln = ln, display_add_to_basket = display_add_to_basket)) elif format.startswith("hd"): # HTML detailed format: for irec in range(irec_max, irec_min, -1): if record_exists(recIDs[irec]) == -1: print_warning(req, _("The record has been deleted.")) continue unordered_tabs = get_detailed_page_tabs(get_colID(guess_primary_collection_of_a_record(recIDs[irec])), recIDs[irec], ln=ln) ordered_tabs_id = [(tab_id, values['order']) for (tab_id, values) in unordered_tabs.iteritems()] ordered_tabs_id.sort(lambda x,y: cmp(x[1],y[1])) link_ln = '' if 
ln != CFG_SITE_LANG: link_ln = '?ln=%s' % ln recid = recIDs[irec] recid_to_display = recid # Record ID used to build the URL. if CFG_WEBSEARCH_USE_ALEPH_SYSNOS: try: recid_to_display = get_fieldvalues(recid, CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG)[0] except IndexError: # No external sysno is available, keep using # internal recid. pass citedbynum = 0 #num of citations, to be shown in the cit tab references = -1 #num of references citedbynum = get_cited_by_count(recid) reftag = "" reftags = get_field_tags("reference") if reftags: reftag = reftags[0] tmprec = get_record(recid) if reftag and len(reftag) > 4: references = len(record_get_field_instances(tmprec, reftag[0:3], reftag[3], reftag[4])) tabs = [(unordered_tabs[tab_id]['label'], \ '%s/record/%s/%s%s' % (CFG_SITE_URL, recid_to_display, tab_id, link_ln), \ tab_id == tab, unordered_tabs[tab_id]['enabled']) \ for (tab_id, order) in ordered_tabs_id if unordered_tabs[tab_id]['visible'] == True] # load content if tab == 'usage': req.write(webstyle_templates.detailed_record_container_top(recIDs[irec], tabs, ln, citationnum=citedbynum, referencenum=references)) r = calculate_reading_similarity_list(recIDs[irec], "downloads") downloadsimilarity = None downloadhistory = None #if r: # downloadsimilarity = r if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS: downloadhistory = create_download_history_graph_and_box(recIDs[irec], ln) r = calculate_reading_similarity_list(recIDs[irec], "pageviews") viewsimilarity = None if r: viewsimilarity = r content = websearch_templates.tmpl_detailed_record_statistics(recIDs[irec], ln, downloadsimilarity=downloadsimilarity, downloadhistory=downloadhistory, viewsimilarity=viewsimilarity) req.write(content) req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec], tabs, ln)) elif tab == 'citations': recid = recIDs[irec] req.write(webstyle_templates.detailed_record_container_top(recid, tabs, ln, citationnum=citedbynum, referencenum=references)) req.write(websearch_templates.tmpl_detailed_record_citations_prologue(recid, ln)) # Citing citinglist = calculate_cited_by_list(recid) req.write(websearch_templates.tmpl_detailed_record_citations_citing_list(recid, ln, citinglist=citinglist)) # Self-cited selfcited = get_self_cited_by(recid) req.write(websearch_templates.tmpl_detailed_record_citations_self_cited(recid, ln, selfcited=selfcited, citinglist=citinglist)) # Co-cited s = calculate_co_cited_with_list(recid) cociting = None if s: cociting = s req.write(websearch_templates.tmpl_detailed_record_citations_co_citing(recid, ln, cociting=cociting)) # Citation history, if needed citationhistory = None if citinglist: citationhistory = create_citation_history_graph_and_box(recid, ln) #debug if verbose > 3: print_warning(req, "Citation graph debug: " + \ str(len(citationhistory))) req.write(websearch_templates.tmpl_detailed_record_citations_citation_history(recid, ln, citationhistory)) req.write(websearch_templates.tmpl_detailed_record_citations_epilogue(recid, ln)) req.write(webstyle_templates.detailed_record_container_bottom(recid, tabs, ln)) elif tab == 'references': req.write(webstyle_templates.detailed_record_container_top(recIDs[irec], tabs, ln, citationnum=citedbynum, referencenum=references)) req.write(format_record(recIDs[irec], 'HDREF', ln=ln, user_info=user_info, verbose=verbose)) req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec], tabs, ln)) elif tab == 'keywords': from invenio.bibclassify_webinterface import \ record_get_keywords, get_sorting_options, \ generate_keywords, get_keywords_body from 
invenio.webinterface_handler import wash_urlargd form = req.form argd = wash_urlargd(form, { 'generate': (str, 'no'), 'sort': (str, 'occurrences'), 'type': (str, 'tagcloud'), 'numbering': (str, 'off'), }) recid = recIDs[irec] req.write(webstyle_templates.detailed_record_container_top(recid, tabs, ln, citationnum=citedbynum, referencenum=references)) if argd['generate'] == 'yes': # The user asked to generate the keywords. keywords = generate_keywords(req, recid) else: # Get the keywords contained in the MARC. keywords = record_get_keywords(recid, argd) if keywords: req.write(get_sorting_options(argd, keywords)) elif argd['sort'] == 'related' and not keywords: req.write('You may want to run BibIndex.') # Output the keywords or the generate button. get_keywords_body(keywords, req, recid, argd) req.write(webstyle_templates.detailed_record_container_bottom(recid, tabs, ln)) else: # Metadata tab req.write(webstyle_templates.detailed_record_container_top(recIDs[irec], tabs, ln, show_short_rec_p=False, citationnum=citedbynum, referencenum=references)) creationdate = None modificationdate = None if record_exists(recIDs[irec]) == 1: creationdate = get_creation_date(recIDs[irec]) modificationdate = get_modification_date(recIDs[irec]) content = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) content = websearch_templates.tmpl_detailed_record_metadata( recID = recIDs[irec], ln = ln, format = format, creationdate = creationdate, modificationdate = modificationdate, content = content) req.write(content) req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec], tabs, ln, creationdate=creationdate, modificationdate=modificationdate, show_short_rec_p=False)) if len(tabs) > 0: # Add the mini box at bottom of the page if CFG_WEBCOMMENT_ALLOW_REVIEWS: from invenio.webcomment import get_mini_reviews reviews = get_mini_reviews(recid = recIDs[irec], ln=ln) else: reviews = '' actions = format_record(recIDs[irec], 'HDACT', ln=ln, user_info=user_info, verbose=verbose) files = format_record(recIDs[irec], 'HDFILE', ln=ln, user_info=user_info, verbose=verbose) req.write(webstyle_templates.detailed_record_mini_panel(recIDs[irec], ln, format, files=files, reviews=reviews, actions=actions)) else: # Other formats for irec in range(irec_max, irec_min, -1): req.write(print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose)) else: print_warning(req, _("Use different search terms.")) def print_records_prologue(req, format): """ Print the appropriate prologue for list of records in the given format. """ prologue = "" # no prologue needed for HTML or Text formats if format.startswith('xm'): prologue = websearch_templates.tmpl_xml_marc_prologue() elif format.startswith('xn'): prologue = websearch_templates.tmpl_xml_nlm_prologue() elif format.startswith('xw'): prologue = websearch_templates.tmpl_xml_refworks_prologue() elif format.startswith('xr'): prologue = websearch_templates.tmpl_xml_rss_prologue() elif format.startswith('xe'): prologue = websearch_templates.tmpl_xml_endnote_prologue() elif format.startswith('xo'): prologue = websearch_templates.tmpl_xml_mods_prologue() elif format.startswith('x'): prologue = websearch_templates.tmpl_xml_default_prologue() req.write(prologue) def print_records_epilogue(req, format): """ Print the appropriate epilogue for list of records in the given format. 
""" epilogue = "" # no epilogue needed for HTML or Text formats if format.startswith('xm'): epilogue = websearch_templates.tmpl_xml_marc_epilogue() elif format.startswith('xn'): epilogue = websearch_templates.tmpl_xml_nlm_epilogue() elif format.startswith('xw'): epilogue = websearch_templates.tmpl_xml_refworks_epilogue() elif format.startswith('xr'): epilogue = websearch_templates.tmpl_xml_rss_epilogue() elif format.startswith('xe'): epilogue = websearch_templates.tmpl_xml_endnote_epilogue() elif format.startswith('xo'): epilogue = websearch_templates.tmpl_xml_mods_epilogue() elif format.startswith('x'): epilogue = websearch_templates.tmpl_xml_default_epilogue() req.write(epilogue) def get_record(recid): """Directly the record object corresponding to the recid.""" if CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE: value = run_sql("SELECT value FROM bibfmt WHERE id_bibrec=%s AND FORMAT='recstruct'", (recid, )) if value: try: return deserialize_via_marshal(value[0][0]) except: ### In case of corruption, let's rebuild it! pass return create_record(print_record(recid, 'xm'))[0] def print_record(recID, format='hb', ot='', ln=CFG_SITE_LANG, decompress=zlib.decompress, search_pattern=None, user_info=None, verbose=0): """Prints record 'recID' formatted according to 'format'.""" if format == 'recstruct': return get_record(recID) _ = gettext_set_language(ln) #check from user information if the user has the right to see hidden fields/tags in the #records as well can_see_hidden = (acc_authorize_action(user_info, 'runbibedit')[0] == 0) out = "" # sanity check: record_exist_p = record_exists(recID) if record_exist_p == 0: # doesn't exist return out # New Python BibFormat procedure for formatting # Old procedure follows further below # We must still check some special formats, but these # should disappear when BibFormat improves. if not (CFG_BIBFORMAT_USE_OLD_BIBFORMAT \ or format.lower().startswith('t') \ or format.lower().startswith('hm') \ or str(format[0:3]).isdigit() \ or ot): # Unspecified format is hd if format == '': format = 'hd' if record_exist_p == -1 and get_output_format_content_type(format) == 'text/html': # HTML output displays a default value for deleted records. # Other format have to deal with it. 
out += _("The record has been deleted.") else: out += call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) # at the end of HTML brief mode, print the "Detailed record" functionality: if format.lower().startswith('hb') and \ format.lower() != 'hb_p': out += websearch_templates.tmpl_print_record_brief_links( ln = ln, recID = recID, ) return out # Old PHP BibFormat procedure for formatting # print record opening tags, if needed: if format == "marcxml" or format == "oai_dc": out += " <record>\n" out += " <header>\n" for oai_id in get_fieldvalues(recID, CFG_OAI_ID_FIELD): out += " <identifier>%s</identifier>\n" % oai_id out += " <datestamp>%s</datestamp>\n" % get_modification_date(recID) out += " </header>\n" out += " <metadata>\n" if format.startswith("xm") or format == "marcxml": # look for detailed format existence: query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s" res = run_sql(query, (recID, format), 1) if res and record_exist_p == 1: # record 'recID' is formatted in 'format', so print it out += "%s" % decompress(res[0][0]) else: # record 'recID' is not formatted in 'format' -- they are not in "bibfmt" table; so fetch all the data from "bibXXx" tables: if format == "marcxml": out += """ <record xmlns="http://www.loc.gov/MARC21/slim">\n""" out += " <controlfield tag=\"001\">%d</controlfield>\n" % int(recID) elif format.startswith("xm"): out += """ <record>\n""" out += " <controlfield tag=\"001\">%d</controlfield>\n" % int(recID) if record_exist_p == -1: # deleted record, so display only OAI ID and 980: oai_ids = get_fieldvalues(recID, CFG_OAI_ID_FIELD) if oai_ids: out += "<datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\"><subfield code=\"%s\">%s</subfield></datafield>\n" % \ (CFG_OAI_ID_FIELD[0:3], CFG_OAI_ID_FIELD[3:4], CFG_OAI_ID_FIELD[4:5], CFG_OAI_ID_FIELD[5:6], oai_ids[0]) out += "<datafield tag=\"980\" ind1=\"\" ind2=\"\"><subfield code=\"c\">DELETED</subfield></datafield>\n" else: # controlfields query = "SELECT b.tag,b.value,bb.field_number FROM bib00x AS b, bibrec_bib00x AS bb "\ "WHERE bb.id_bibrec=%s AND b.id=bb.id_bibxxx AND b.tag LIKE '00%%' "\ "ORDER BY bb.field_number, b.tag ASC" res = run_sql(query, (recID, )) for row in res: field, value = row[0], row[1] value = encode_for_xml(value) out += """ <controlfield tag="%s" >%s</controlfield>\n""" % \ (encode_for_xml(field[0:3]), value) # datafields i = 1 # Do not process bib00x and bibrec_bib00x, as # they are controlfields. 
So start at bib01x and # bibrec_bib00x (and set i = 0 at the end of # first loop) for digit1 in range(0, 10): for digit2 in range(i, 10): bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\ "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s"\ "ORDER BY bb.field_number, b.tag ASC" % (bx, bibx) res = run_sql(query, (recID, str(digit1)+str(digit2)+'%')) field_number_old = -999 field_old = "" for row in res: field, value, field_number = row[0], row[1], row[2] ind1, ind2 = field[3], field[4] if ind1 == "_" or ind1 == "": ind1 = " " if ind2 == "_" or ind2 == "": ind2 = " " # print field tag, unless hidden printme = True if not can_see_hidden: for htag in CFG_BIBFORMAT_HIDDEN_TAGS: ltag = len(htag) samelenfield = field[0:ltag] if samelenfield == htag: printme = False if printme: if field_number != field_number_old or field[:-1] != field_old[:-1]: if field_number_old != -999: out += """ </datafield>\n""" out += """ <datafield tag="%s" ind1="%s" ind2="%s">\n""" % \ (encode_for_xml(field[0:3]), encode_for_xml(ind1), encode_for_xml(ind2)) field_number_old = field_number field_old = field # print subfield value value = encode_for_xml(value) out += """ <subfield code="%s">%s</subfield>\n""" % \ (encode_for_xml(field[-1:]), value) # all fields/subfields printed in this run, so close the tag: if field_number_old != -999: out += """ </datafield>\n""" i = 0 # Next loop should start looking at bib%0 and bibrec_bib00x # we are at the end of printing the record: out += " </record>\n" elif format == "xd" or format == "oai_dc": # XML Dublin Core format, possibly OAI -- select only some bibXXx fields: out += """ <dc xmlns="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://www.openarchives.org/OAI/1.1/dc.xsd">\n""" if record_exist_p == -1: out += "" else: for f in get_fieldvalues(recID, "041__a"): out += " <language>%s</language>\n" % f for f in get_fieldvalues(recID, "100__a"): out += " <creator>%s</creator>\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "700__a"): out += " <creator>%s</creator>\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "245__a"): out += " <title>%s\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "65017a"): out += " %s\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "8564_u"): out += " %s\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "520__a"): out += " %s\n" % encode_for_xml(f) out += " %s\n" % get_creation_date(recID) out += " \n" elif len(format) == 6 and str(format[0:3]).isdigit(): # user has asked to print some fields only if format == "001": out += "%s\n" % (format, recID, format) else: vals = get_fieldvalues(recID, format) for val in vals: out += "%s\n" % (format, val, format) elif format.startswith('t'): ## user directly asked for some tags to be displayed only if record_exist_p == -1: out += get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"], can_see_hidden) else: out += get_fieldvalues_alephseq_like(recID, ot, can_see_hidden) elif format == "hm": if record_exist_p == -1: out += "\n
" + cgi.escape(get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"], can_see_hidden)) + "
" else: out += "\n
" + cgi.escape(get_fieldvalues_alephseq_like(recID, ot, can_see_hidden)) + "
" elif format.startswith("h") and ot: ## user directly asked for some tags to be displayed only if record_exist_p == -1: out += "\n
" + get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"], can_see_hidden) + "
" else: out += "\n
" + get_fieldvalues_alephseq_like(recID, ot, can_see_hidden) + "
" elif format == "hd": # HTML detailed format if record_exist_p == -1: out += _("The record has been deleted.") else: # look for detailed format existence: query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s" res = run_sql(query, (recID, format), 1) if res: # record 'recID' is formatted in 'format', so print it out += "%s" % decompress(res[0][0]) else: # record 'recID' is not formatted in 'format', so try to call BibFormat on the fly or use default format: out_record_in_format = call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) if out_record_in_format: out += out_record_in_format else: out += websearch_templates.tmpl_print_record_detailed( ln = ln, recID = recID, ) elif format.startswith("hb_") or format.startswith("hd_"): # underscore means that HTML brief/detailed formats should be called on-the-fly; suitable for testing formats if record_exist_p == -1: out += _("The record has been deleted.") else: out += call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) elif format.startswith("hx"): # BibTeX format, called on the fly: if record_exist_p == -1: out += _("The record has been deleted.") else: out += call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) elif format.startswith("hs"): # for citation/download similarity navigation links: if record_exist_p == -1: out += _("The record has been deleted.") else: out += '' % websearch_templates.build_search_url(recid=recID, ln=ln) # firstly, title: titles = get_fieldvalues(recID, "245__a") if titles: for title in titles: out += "%s" % title else: # usual title not found, try conference title: titles = get_fieldvalues(recID, "111__a") if titles: for title in titles: out += "%s" % title else: # just print record ID: out += "%s %d" % (get_field_i18nname("record ID", ln, False), recID) out += "" # secondly, authors: authors = get_fieldvalues(recID, "100__a") + get_fieldvalues(recID, "700__a") if authors: out += " - %s" % authors[0] if len(authors) > 1: out += " et al" # thirdly publication info: publinfos = get_fieldvalues(recID, "773__s") if not publinfos: publinfos = get_fieldvalues(recID, "909C4s") if not publinfos: publinfos = get_fieldvalues(recID, "037__a") if not publinfos: publinfos = get_fieldvalues(recID, "088__a") if publinfos: out += " - %s" % publinfos[0] else: # fourthly publication year (if not publication info): years = get_fieldvalues(recID, "773__y") if not years: years = get_fieldvalues(recID, "909C4y") if not years: years = get_fieldvalues(recID, "260__c") if years: out += " (%s)" % years[0] else: # HTML brief format by default if record_exist_p == -1: out += _("The record has been deleted.") else: query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s" res = run_sql(query, (recID, format)) if res: # record 'recID' is formatted in 'format', so print it out += "%s" % decompress(res[0][0]) else: # record 'recID' is not formatted in 'format', so try to call BibFormat on the fly: or use default format: if CFG_WEBSEARCH_CALL_BIBFORMAT: out_record_in_format = call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) if out_record_in_format: out += out_record_in_format else: out += websearch_templates.tmpl_print_record_brief( ln = ln, recID = recID, ) else: out += websearch_templates.tmpl_print_record_brief( ln = ln, recID = recID, ) # at the end of HTML brief mode, print the "Detailed record" functionality: if 
if format == 'hp' or format.startswith("hb_") or format.startswith("hd_"): pass # do nothing for portfolio and on-the-fly formats else: out += websearch_templates.tmpl_print_record_brief_links( ln = ln, recID = recID, ) # print record closing tags, if needed: if format == "marcxml" or format == "oai_dc": out += " </metadata>\n" out += " </record>\n" return out def call_bibformat(recID, format="HD", ln=CFG_SITE_LANG, search_pattern=None, user_info=None, verbose=0): """ Calls BibFormat and returns formatted record. BibFormat will decide by itself if old or new BibFormat must be used. """ from invenio.bibformat_utils import get_pdf_snippets keywords = [] if search_pattern is not None: units = create_basic_search_units(None, str(search_pattern), None) keywords = [unit[1] for unit in units if unit[0] != '-'] out = format_record(recID, of=format, ln=ln, search_pattern=keywords, user_info=user_info, verbose=verbose) if CFG_WEBSEARCH_FULLTEXT_SNIPPETS and user_info and \ 'fulltext' in user_info['uri']: # check snippets only if URL contains fulltext # FIXME: make it work for CLI too, via new function arg if keywords: snippets = get_pdf_snippets(recID, keywords) if snippets: out += snippets return out def log_query(hostname, query_args, uid=-1): """ Log query into the query and user_query tables. Return id_query or None in case of problems. """ id_query = None if uid >= 0: # log the query only if uid is reasonable res = run_sql("SELECT id FROM query WHERE urlargs=%s", (query_args,), 1) try: id_query = res[0][0] except: id_query = run_sql("INSERT INTO query (type, urlargs) VALUES ('r', %s)", (query_args,)) if id_query: run_sql("INSERT INTO user_query (id_user, id_query, hostname, date) VALUES (%s, %s, %s, %s)", (uid, id_query, hostname, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))) return id_query def log_query_info(action, p, f, colls, nb_records_found_total=-1): """Write some info to the log file for later analysis.""" try: log = open(CFG_LOGDIR + "/search.log", "a") log.write(time.strftime("%Y%m%d%H%M%S#", time.localtime())) log.write(action+"#") log.write(p+"#") log.write(f+"#") for coll in colls[:-1]: log.write("%s," % coll) log.write("%s#" % colls[-1]) log.write("%d" % nb_records_found_total) log.write("\n") log.close() except: pass return ### CALLABLES def perform_request_search(req=None, cc=CFG_SITE_NAME, c=None, p="", f="", rg=CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS, sf="", so="d", sp="", rm="", of="id", ot="", aas=0, p1="", f1="", m1="", op1="", p2="", f2="", m2="", op2="", p3="", f3="", m3="", sc=0, jrec=0, recid=-1, recidb=-1, sysno="", id=-1, idb=-1, sysnb="", action="", d1="", d1y=0, d1m=0, d1d=0, d2="", d2y=0, d2m=0, d2d=0, dt="", verbose=0, ap=0, ln=CFG_SITE_LANG, ec=None, tab=""): """Perform search or browse request, without checking for authentication. Return list of recIDs found, if of=id. Otherwise create web page. The arguments are as follows: req - mod_python Request class instance. cc - current collection (e.g. "ATLAS"). The collection the user started to search/browse from. c - collection list (e.g. ["Theses", "Books"]). The collections user may have selected/deselected when starting to search from 'cc'. p - pattern to search for (e.g. "ellis and muon or kaon"). f - field to search within (e.g. "author"). rg - records in groups of (e.g. "10"). Defines how many hits per collection in the search results page are displayed. sf - sort field (e.g. "title"). so - sort order ("a"=ascending, "d"=descending).
"CERN-") -- in case there are more values in a sort field, this argument tells which one to prefer rm - ranking method (e.g. "jif"). Defines whether results should be ranked by some known ranking method. of - output format (e.g. "hb"). Usually starting "h" means HTML output (and "hb" for HTML brief, "hd" for HTML detailed), "x" means XML output, "t" means plain text output, "id" means no output at all but to return list of recIDs found. (Suitable for high-level API.) ot - output only these MARC tags (e.g. "100,700,909C0b"). Useful if only some fields are to be shown in the output, e.g. for library to control some fields. aas - advanced search ("0" means no, "1" means yes). Whether search was called from within the advanced search interface. p1 - first pattern to search for in the advanced search interface. Much like 'p'. f1 - first field to search within in the advanced search interface. Much like 'f'. m1 - first matching type in the advanced search interface. ("a" all of the words, "o" any of the words, "e" exact phrase, "p" partial phrase, "r" regular expression). op1 - first operator, to join the first and the second unit in the advanced search interface. ("a" add, "o" or, "n" not). p2 - second pattern to search for in the advanced search interface. Much like 'p'. f2 - second field to search within in the advanced search interface. Much like 'f'. m2 - second matching type in the advanced search interface. ("a" all of the words, "o" any of the words, "e" exact phrase, "p" partial phrase, "r" regular expression). op2 - second operator, to join the second and the third unit in the advanced search interface. ("a" add, "o" or, "n" not). p3 - third pattern to search for in the advanced search interface. Much like 'p'. f3 - third field to search within in the advanced search interface. Much like 'f'. m3 - third matching type in the advanced search interface. ("a" all of the words, "o" any of the words, "e" exact phrase, "p" partial phrase, "r" regular expression). sc - split by collection ("0" no, "1" yes). Governs whether we want to present the results in a single huge list, or splitted by collection. jrec - jump to record (e.g. "234"). Used for navigation inside the search results. recid - display record ID (e.g. "20000"). Do not search/browse but go straight away to the Detailed record page for the given recID. recidb - display record ID bis (e.g. "20010"). If greater than 'recid', then display records from recid to recidb. Useful for example for dumping records from the database for reformatting. sysno - display old system SYS number (e.g. ""). If you migrate to CDS Invenio from another system, and store your old SYS call numbers, you can use them instead of recid if you wish so. id - the same as recid, in case recid is not set. For backwards compatibility. idb - the same as recid, in case recidb is not set. For backwards compatibility. sysnb - the same as sysno, in case sysno is not set. For backwards compatibility. action - action to do. "SEARCH" for searching, "Browse" for browsing. Default is to search. d1 - first datetime in full YYYY-mm-dd HH:MM:DD format (e.g. "1998-08-23 12:34:56"). Useful for search limits on creation/modification date (see 'dt' argument below). Note that 'd1' takes precedence over d1y, d1m, d1d if these are defined. d1y - first date's year (e.g. "1998"). Useful for search limits on creation/modification date. d1m - first date's month (e.g. "08"). Useful for search limits on creation/modification date. d1d - first date's day (e.g. "23"). 
Useful for search limits on creation/modification date. d2 - second datetime in full YYYY-mm-dd HH:MM:SS format (e.g. "1998-09-02 12:34:56"). Useful for search limits on creation/modification date (see 'dt' argument below). Note that 'd2' takes precedence over d2y, d2m, d2d if these are defined. d2y - second date's year (e.g. "1998"). Useful for search limits on creation/modification date. d2m - second date's month (e.g. "09"). Useful for search limits on creation/modification date. d2d - second date's day (e.g. "02"). Useful for search limits on creation/modification date. dt - first and second date's type (e.g. "c"). Specifies whether to search in creation dates ("c") or in modification dates ("m"). When dt is not set and d1* and d2* are set, the default is "c". verbose - verbose level (0=min, 9=max). Useful to print some internal information on the searching process in case something goes wrong. ap - alternative patterns (0=no, 1=yes). In case no exact match is found, the search engine can try alternative patterns e.g. to replace non-alphanumeric characters by a boolean query. ap defines if this is wanted. ln - language of the search interface (e.g. "en"). Useful for internationalization. ec - list of external search engines to search as well (e.g. "SPIRES HEP"). """ selected_external_collections_infos = None # wash output format: of = wash_output_format(of) # raise an exception when trying to print out html from the cli if of.startswith("h"): assert req # for every search engine request asking for an HTML output, we # first regenerate cache of collection and field I18N names if # needed; so that later we won't bother checking timestamps for # I18N names at all: if of.startswith("h"): collection_i18nname_cache.recreate_cache_if_needed() field_i18nname_cache.recreate_cache_if_needed() # wash all arguments requiring special care try: (cc, colls_to_display, colls_to_search, hosted_colls, wash_colls_debug) = wash_colls(cc, c, sc, verbose) # which colls to search and to display?
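# Editor's usage sketch (illustrative; the pattern and record IDs are
# hypothetical): with of="id", the function above acts as the
# high-level search API and returns the matching recIDs directly:
#   >>> perform_request_search(p='ellis', f='author', of='id')
#   [8, 11, 13]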
except InvenioWebSearchUnknownCollectionError, exc: colname = exc.colname if of.startswith("h"): page_start(req, of, cc, aas, ln, getUid(req), websearch_templates.tmpl_collection_not_found_page_title(colname, ln)) req.write(websearch_templates.tmpl_collection_not_found_page_body(colname, ln)) return page_end(req, of, ln) elif of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) else: return page_end(req, of, ln) p = wash_pattern(p) f = wash_field(f) p1 = wash_pattern(p1) f1 = wash_field(f1) p2 = wash_pattern(p2) f2 = wash_field(f2) p3 = wash_pattern(p3) f3 = wash_field(f3) datetext1, datetext2 = wash_dates(d1, d1y, d1m, d1d, d2, d2y, d2m, d2d) # wash ranking method: if not is_method_valid(None, rm): rm = "" _ = gettext_set_language(ln) # backwards compatibility: id, idb, sysnb -> recid, recidb, sysno (if applicable) if sysnb != "" and sysno == "": sysno = sysnb if id > 0 and recid == -1: recid = id if idb > 0 and recidb == -1: recidb = idb # TODO deduce passed search limiting criteria (if applicable) pl, pl_in_url = "", "" # no limits by default if action != "browse" and req and not isinstance(req, cStringIO.OutputType) \ and req.args: # we do not want to add options while browsing or while calling via command-line fieldargs = cgi.parse_qs(req.args) for fieldcode in get_fieldcodes(): if fieldargs.has_key(fieldcode): for val in fieldargs[fieldcode]: pl += "+%s:\"%s\" " % (fieldcode, val) pl_in_url += "&%s=%s" % (urllib.quote(fieldcode), urllib.quote(val)) # deduce recid from sysno argument (if applicable): if sysno: # ALEPH SYS number was passed, so deduce DB recID for the record: recid = get_mysql_recid_from_aleph_sysno(sysno) if recid is None: recid = 0 # use recid 0 to indicate that this sysno does not exist # deduce collection we are in (if applicable): if recid > 0: referer = None if req: referer = req.headers_in.get('Referer') cc = guess_collection_of_a_record(recid, referer) # deduce user id (if applicable): try: uid = getUid(req) except: uid = 0 ## 0 - start output if recid >= 0: # recid can be 0 if deduced from sysno and if such sysno does not exist ## 1 - detailed record display title, description, keywords = \ websearch_templates.tmpl_record_page_header_content(req, recid, ln) if req is not None and not req.header_only: page_start(req, of, cc, aas, ln, uid, title, description, keywords, recid, tab) # Default format is hb but we are in detailed -> change 'of' if of == "hb": of = "hd" if record_exists(recid): if recidb <= recid: # sanity check recidb = recid + 1 if of == "id": return [recidx for recidx in range(recid, recidb) if record_exists(recidx)] else: print_records(req, range(recid, recidb), -1, -9999, of, ot, ln, search_pattern=p, verbose=verbose, tab=tab) if req and of.startswith("h"): # register detailed record page view event client_ip_address = str(req.remote_ip) register_page_view_event(recid, uid, client_ip_address) else: # record does not exist if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) elif of.startswith("h"): if req.header_only: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND else: print_warning(req, _("Requested record does not seem to exist.")) elif action == "browse": ## 2 - browse needed of = 'hb' page_start(req, of, cc, aas, ln, uid, _("Browse"), p=create_page_title_search_pattern_info(p, p1, p2, p3)) req.write(create_search_box(cc,
colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action)) try: if aas == 1 or (p1 or p2 or p3): browse_pattern(req, colls_to_search, p1, f1, rg, ln) browse_pattern(req, colls_to_search, p2, f2, rg, ln) browse_pattern(req, colls_to_search, p3, f3, rg, ln) else: browse_pattern(req, colls_to_search, p, f, rg, ln) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) elif rm and p.startswith("recid:"): ## 3-ter - similarity search or citation search needed if req and not req.header_only: page_start(req, of, cc, aas, ln, uid, _("Search Results"), p=create_page_title_search_pattern_info(p, p1, p2, p3)) if of.startswith("h"): req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action)) if record_exists(p[6:]) != 1: # record does not exist if of.startswith("h"): if req.header_only: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND else: print_warning(req, _("Requested record does not seem to exist.")) if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) else: # record well exists, so find similar ones to it t1 = os.times()[4] results_similar_recIDs, results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, results_similar_comments = \ rank_records(rm, 0, get_collection_reclist(cc), string.split(p), verbose) if results_similar_recIDs: t2 = os.times()[4] cpu_time = t2 - t1 if of.startswith("h"): req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, cc, len(results_similar_recIDs), jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) print_warning(req, results_similar_comments) print_records(req, results_similar_recIDs, jrec, rg, of, ot, ln, results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, search_pattern=p, verbose=verbose) elif of=="id": return results_similar_recIDs elif of.startswith("x"): print_records(req, results_similar_recIDs, jrec, rg, of, ot, ln, results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, search_pattern=p, verbose=verbose) else: # rank_records failed and returned some error message to display: if of.startswith("h"): print_warning(req, results_similar_relevances_prologue) print_warning(req, results_similar_relevances_epilogue) print_warning(req, results_similar_comments) if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) elif p.startswith("cocitedwith:"): #WAS EXPERIMENTAL ## 3-terter - cited by search needed page_start(req, of, cc, aas, ln, uid, _("Search Results"), p=create_page_title_search_pattern_info(p, p1, p2, p3)) if of.startswith("h"): req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action)) recID = p[12:] if record_exists(recID) != 1: # record does not exist if 
of.startswith("h"): print_warning(req, _("Requested record does not seem to exist.")) if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) else: # record well exists, so find co-cited ones: t1 = os.times()[4] results_cocited_recIDs = map(lambda x: x[0], calculate_co_cited_with_list(int(recID))) if results_cocited_recIDs: t2 = os.times()[4] cpu_time = t2 - t1 if of.startswith("h"): req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, CFG_SITE_NAME, len(results_cocited_recIDs), jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) print_records(req, results_cocited_recIDs, jrec, rg, of, ot, ln, search_pattern=p, verbose=verbose) elif of=="id": return results_cocited_recIDs elif of.startswith("x"): print_records(req, results_cocited_recIDs, jrec, rg, of, ot, ln, search_pattern=p, verbose=verbose) else: # cited rank_records failed and returned some error message to display: if of.startswith("h"): print_warning(req, "nothing found") if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) else: ## 3 - common search needed query_in_cache = False query_representation_in_cache = repr((p,f,colls_to_search)) page_start(req, of, cc, aas, ln, uid, p=create_page_title_search_pattern_info(p, p1, p2, p3)) if of.startswith("h") and verbose and wash_colls_debug: print_warning(req, "wash_colls debugging info : %s" % wash_colls_debug) # search into the hosted collections only if the output format is html or xml if hosted_colls and (of.startswith("h") or of.startswith("x")) and not p.startswith("recid:"): # hosted_colls_results : the hosted collections' searches that did not timeout # hosted_colls_timeouts : the hosted collections' searches that timed out and will be searched later on again (hosted_colls_results, hosted_colls_timeouts) = calculate_hosted_collections_results(req, [p, p1, p2, p3], f, hosted_colls, verbose, ln, CFG_HOSTED_COLLECTION_TIMEOUT_ANTE_SEARCH) # successful searches if hosted_colls_results: hosted_colls_true_results = [] for result in hosted_colls_results: # if the number of results is None or 0 (or False) then just do nothing if result[1] == None or result[1] == False: # these are the searches the returned no or zero results if verbose: print_warning(req, "Hosted collections (perform_search_request): %s returned no results" % result[0][1].name) else: # these are the searches that actually returned results on time hosted_colls_true_results.append(result) if verbose: print_warning(req, "Hosted collections (perform_search_request): %s returned %s results in %s seconds" % (result[0][1].name, result[1], result[2])) else: if verbose: print_warning(req, "Hosted collections (perform_search_request): there were no hosted collections results to be printed at this time") if hosted_colls_timeouts: if verbose: for timeout in hosted_colls_timeouts: print_warning(req, "Hosted collections (perform_search_request): %s timed out and will be searched again later" % timeout[0][1].name) # we need to know for later use if there were any hosted collections to be searched even if they weren't in the end elif hosted_colls and ((not (of.startswith("h") or of.startswith("x"))) or p.startswith("recid:")): (hosted_colls_results, hosted_colls_timeouts) = (None, None) else: if verbose: print_warning(req, "Hosted collections (perform_search_request): there were no 
hosted collections to be searched") ## let's define some useful boolean variables: # True means there are actual or potential hosted collections results to be printed hosted_colls_actual_or_potential_results_p = not (not hosted_colls or not ((hosted_colls_results and hosted_colls_true_results) or hosted_colls_timeouts)) # True means there are hosted collections timeouts to take care of later # (useful for more accurate printing of results later) hosted_colls_potential_results_p = not (not hosted_colls or not hosted_colls_timeouts) # True means we only have hosted collections to deal with only_hosted_colls_actual_or_potential_results_p = not colls_to_search and hosted_colls_actual_or_potential_results_p if of.startswith("h"): req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action)) t1 = os.times()[4] results_in_any_collection = HitSet() if aas == 1 or (p1 or p2 or p3): ## 3A - advanced search try: results_in_any_collection = search_pattern_parenthesised(req, p1, f1, m1, ap=ap, of=of, verbose=verbose, ln=ln) if len(results_in_any_collection) == 0: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) if p2: results_tmp = search_pattern_parenthesised(req, p2, f2, m2, ap=ap, of=of, verbose=verbose, ln=ln) if op1 == "a": # add results_in_any_collection.intersection_update(results_tmp) elif op1 == "o": # or results_in_any_collection.union_update(results_tmp) elif op1 == "n": # not results_in_any_collection.difference_update(results_tmp) else: if of.startswith("h"): print_warning(req, "Invalid set operation %s." % cgi.escape(op1), "Error") if len(results_in_any_collection) == 0: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) if p3: results_tmp = search_pattern_parenthesised(req, p3, f3, m3, ap=ap, of=of, verbose=verbose, ln=ln) if op2 == "a": # add results_in_any_collection.intersection_update(results_tmp) elif op2 == "o": # or results_in_any_collection.union_update(results_tmp) elif op2 == "n": # not results_in_any_collection.difference_update(results_tmp) else: if of.startswith("h"): print_warning(req, "Invalid set operation %s." 
% cgi.escape(op2), "Error") except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) else: ## 3B - simple search if search_results_cache.cache.has_key(query_representation_in_cache): # query is not in the cache already, so reuse it: query_in_cache = True results_in_any_collection = search_results_cache.cache[query_representation_in_cache] if verbose and of.startswith("h"): print_warning(req, "Search stage 0: query found in cache, reusing cached results.") else: try: # added the display_nearest_terms_box parameter to avoid printing out the "Nearest terms in any collection" # recommendations when there are results only in the hosted collections. Also added the if clause to avoid # searching in case we know we only have actual or potential hosted collections results if not only_hosted_colls_actual_or_potential_results_p: results_in_any_collection = search_pattern_parenthesised(req, p, f, ap=ap, of=of, verbose=verbose, ln=ln, display_nearest_terms_box=not hosted_colls_actual_or_potential_results_p) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if len(results_in_any_collection) == 0 and not hosted_colls_actual_or_potential_results_p: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) # store this search query results into search results cache if needed: if CFG_WEBSEARCH_SEARCH_CACHE_SIZE and not query_in_cache: if len(search_results_cache.cache) > CFG_WEBSEARCH_SEARCH_CACHE_SIZE: search_results_cache.clear() search_results_cache.cache[query_representation_in_cache] = results_in_any_collection if verbose and of.startswith("h"): print_warning(req, "Search stage 3: storing query results in cache.") # search stage 4: intersection with collection universe: try: # added the display_nearest_terms_box parameter to avoid printing out the "Nearest terms in any collection" # recommendations when there results only in the hosted collections. 
Also added the if clause to avoid # searching in case we know since the last stage that we have no results in any collection if len(results_in_any_collection) != 0: results_final = intersect_results_with_collrecs(req, results_in_any_collection, colls_to_search, ap, of, verbose, ln, display_nearest_terms_box=not hosted_colls_actual_or_potential_results_p) else: results_final = {} except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if results_final == {} and not hosted_colls_actual_or_potential_results_p: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) if of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) # search stage 5: apply search option limits and restrictions: if datetext1 != "" and results_final != {}: if verbose and of.startswith("h"): print_warning(req, "Search stage 5: applying time etc limits, from %s until %s..." % (datetext1, datetext2)) try: results_final = intersect_results_with_hitset(req, results_final, search_unit_in_bibrec(datetext1, datetext2, dt), ap, aptext= _("No match within your time limits, " "discarding this condition..."), of=of) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if results_final == {} and not hosted_colls_actual_or_potential_results_p: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) #if of.startswith("x"): # # Print empty, but valid XML # print_records_prologue(req, of) # print_records_epilogue(req, of) return page_end(req, of, ln) if pl and results_final != {}: pl = wash_pattern(pl) if verbose and of.startswith("h"): print_warning(req, "Search stage 5: applying search pattern limit %s..." 
% cgi.escape(pl)) try: results_final = intersect_results_with_hitset(req, results_final, search_pattern_parenthesised(req, pl, ap=0, ln=ln), ap, aptext=_("No match within your search limits, " "discarding this condition..."), of=of) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if results_final == {} and not hosted_colls_actual_or_potential_results_p: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) if of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) t2 = os.times()[4] cpu_time = t2 - t1 ## search stage 6: display results: results_final_nb_total = 0 results_final_nb = {} # will hold number of records found in each collection # (in simple dict to display overview more easily) for coll in results_final.keys(): results_final_nb[coll] = len(results_final[coll]) #results_final_nb_total += results_final_nb[coll] # Now let us calculate results_final_nb_total more precisely, # in order to get the total number of "distinct" hits across # searched collections; this is useful because a record might # have been attributed to more than one primary collection; so # we have to avoid counting it multiple times. The price to # pay for this accuracy of results_final_nb_total is somewhat # increased CPU time. if len(results_final.keys()) == 1: # only one collection; no need to union them results_final_for_all_selected_colls = results_final.values()[0] results_final_nb_total = results_final_nb.values()[0] else: # okay, some work ahead to union hits across collections: results_final_for_all_selected_colls = HitSet() for coll in results_final.keys(): results_final_for_all_selected_colls.union_update(results_final[coll]) results_final_nb_total = len(results_final_for_all_selected_colls) #if hosted_colls and (of.startswith("h") or of.startswith("x")): if hosted_colls_actual_or_potential_results_p: if hosted_colls_results: for result in hosted_colls_true_results: colls_to_search.append(result[0][1].name) results_final_nb[result[0][1].name] = result[1] results_final_nb_total += result[1] cpu_time += result[2] if hosted_colls_timeouts: for timeout in hosted_colls_timeouts: colls_to_search.append(timeout[1].name) # use -963 as a special number to identify the collections that timed out results_final_nb[timeout[1].name] = -963 # we continue past this point only if there is a hosted collection that has timed out and might offer potential results if results_final_nb_total == 0 and not hosted_colls_potential_results_p: if of.startswith("h"): print_warning(req, "No match found, please enter different search terms.") elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) else: # yes, some hits found: good! # collection list may have changed due to not-exact-match-found policy so check it out: for coll in results_final.keys(): if coll not in colls_to_search: colls_to_search.append(coll) # print results overview: if of == "id": # we have been asked to return list of recIDs recIDs = list(results_final_for_all_selected_colls) if sf: # do we have to sort? recIDs = sort_records(req, recIDs, sf, so, sp, verbose, of) elif rm: # do we have to rank?
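# Editor's note (illustrative; parameter values are hypothetical): in
# this of="id" branch an explicit sort field takes precedence over a
# ranking method, e.g.:
#   perform_request_search(p='ellis', of='id', sf='title')  # sorted by title
#   perform_request_search(p='ellis', of='id', rm='wrd')    # ranked by word similarity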
results_final_for_all_colls_rank_records_output = rank_records(rm, 0, results_final_for_all_selected_colls, string.split(p) + string.split(p1) + string.split(p2) + string.split(p3), verbose) if results_final_for_all_colls_rank_records_output[0]: recIDs = results_final_for_all_colls_rank_records_output[0] return recIDs elif of.startswith("h"): if of not in ['hcs']: # added the hosted_colls_potential_results_p parameter to help print out the overview more accurately req.write(print_results_overview(req, colls_to_search, results_final_nb_total, results_final_nb, cpu_time, ln, ec, hosted_colls_potential_results_p=hosted_colls_potential_results_p)) selected_external_collections_infos = print_external_results_overview(req, cc, [p, p1, p2, p3], f, ec, verbose, ln) # print number of hits found for XML outputs: if of.startswith("x"): req.write("<!-- Search-Engine-Total-Number-Of-Results: %s -->\n" % results_final_nb_total) # print records: if of in ['hcs']: # feed the current search to be summarized: from invenio.search_engine_summarizer import summarize_records summarize_records(results_final_for_all_selected_colls, 'hcs', ln, p, f, req) else: if len(colls_to_search)>1: cpu_time = -1 # we do not want to have search time printed on each collection print_records_prologue(req, of) for coll in colls_to_search: if results_final.has_key(coll) and len(results_final[coll]): if of.startswith("h"): req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, coll, results_final_nb[coll], jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) results_final_recIDs = list(results_final[coll]) results_final_relevances = [] results_final_relevances_prologue = "" results_final_relevances_epilogue = "" if sf: # do we have to sort? results_final_recIDs = sort_records(req, results_final_recIDs, sf, so, sp, verbose, of) elif rm: # do we have to rank?
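# Editor's note (illustrative): rank_records() returns a 5-tuple of
# (ranked recIDs, relevances, relevances prologue, relevances
# epilogue, comments); an empty recID list signals a ranking error
# whose message is carried in the prologue/epilogue, as handled below.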
results_final_recIDs_ranked, results_final_relevances, results_final_relevances_prologue, results_final_relevances_epilogue, results_final_comments = \ rank_records(rm, 0, results_final[coll], string.split(p) + string.split(p1) + string.split(p2) + string.split(p3), verbose) if of.startswith("h"): print_warning(req, results_final_comments) if results_final_recIDs_ranked: results_final_recIDs = results_final_recIDs_ranked else: # rank_records failed and returned some error message to display: print_warning(req, results_final_relevances_prologue) print_warning(req, results_final_relevances_epilogue) print_records(req, results_final_recIDs, jrec, rg, of, ot, ln, results_final_relevances, results_final_relevances_prologue, results_final_relevances_epilogue, search_pattern=p, print_records_prologue_p=False, print_records_epilogue_p=False, verbose=verbose) if of.startswith("h"): req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, coll, results_final_nb[coll], jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1)) #if hosted_colls and (of.startswith("h") or of.startswith("x")): if hosted_colls_actual_or_potential_results_p: if hosted_colls_results: # TODO: add a verbose message here for result in hosted_colls_true_results: if of.startswith("h"): req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, results_final_nb[result[0][1].name], jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) req.write(print_hosted_results(url_and_engine=result[0], ln=ln, of=of, req=req, limit=rg)) if of.startswith("h"): req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, results_final_nb[result[0][1].name], jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1)) if hosted_colls_timeouts: # TODO: add a verbose message here # TODO: check if verbose messages still work when dealing with (re)calculations of timeouts (hosted_colls_timeouts_results, hosted_colls_timeouts_timeouts) = do_calculate_hosted_collections_results(req, ln, None, verbose, None, hosted_colls_timeouts, CFG_HOSTED_COLLECTION_TIMEOUT_POST_SEARCH) if hosted_colls_timeouts_results: hosted_colls_timeouts_true_results = [] for result in hosted_colls_timeouts_results: if result[1] == None or result[1] == False: ## these are the searches that returned no or zero results ## also print a nearest terms box, in case this is the only ## collection being searched and it returns no results?
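# Editor's note (illustrative; the collection name is hypothetical):
# -963 is the sentinel hit count used above to mark hosted collections
# whose remote search timed out, so the overview can flag them:
#   results_final_nb['SomeHostedCollection'] = -963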
if of.startswith("h"): req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, -963, jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) req.write(print_hosted_results(url_and_engine=result[0], ln=ln, of=of, req=req, no_records_found=True, limit=rg)) req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, -963, jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1)) else: # these are the searches that actually returned results on time if of.startswith("h"): req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, result[1], jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) req.write(print_hosted_results(url_and_engine=result[0], ln=ln, of=of, req=req, limit=rg)) if of.startswith("h"): req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, result[1], jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1)) if hosted_colls_timeouts_timeouts: for timeout in hosted_colls_timeouts_timeouts: if of.startswith("h"): req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, timeout[1].name, -963, jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) req.write(print_hosted_results(url_and_engine=timeout[0], ln=ln, of=of, req=req, search_timed_out=True, limit=rg)) req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, timeout[1].name, -963, jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1)) print_records_epilogue(req, of) if f == "author" and of.startswith("h"): req.write(create_similarly_named_authors_link_box(p, ln)) # log query: try: id_query = log_query(req.remote_host, req.args, uid) if of.startswith("h") and id_query: if not of in ['hcs']: # display alert/RSS teaser for non-summary formats: user_info = collect_user_info(req) display_email_alert_part = True if user_info: if user_info['email'] == 'guest': if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS > 4: display_email_alert_part = False else: if not user_info['precached_usealerts']: display_email_alert_part = False req.write(websearch_templates.tmpl_alert_rss_teaser_box_for_query(id_query, \ ln=ln, display_email_alert_part=display_email_alert_part)) except: # do not log query if req is None (used by CLI interface) pass log_query_info("ss", p, f, colls_to_search, results_final_nb_total) # External searches if of.startswith("h"): if not of in ['hcs']: perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln)

def perform_request_cache(req, action="show"): """Manipulates the search engine cache.""" req.content_type = "text/html" req.send_http_header() req.write("<html>") out = "" out += "<h1>Search Cache</h1>" # clear cache if requested: if action == "clear": search_results_cache.clear() req.write(out) # show collection reclist cache: out = "<h3>Collection reclist cache</h3>" out += "- collection table last updated: %s" % get_table_update_time('collection') out += "<br />- reclist cache timestamp: %s" % collection_reclist_cache.timestamp out += "<br />- reclist cache contents:" out += "<blockquote>" for coll in collection_reclist_cache.cache.keys(): if collection_reclist_cache.cache[coll]: out += "%s (%d)<br />" % (coll, len(collection_reclist_cache.cache[coll])) out += "</blockquote>" req.write(out) # show search results cache: out = "<h3>Search Cache</h3>" out += "- search cache usage: %d queries cached (max. ~%d)" % \ (len(search_results_cache.cache), CFG_WEBSEARCH_SEARCH_CACHE_SIZE) if len(search_results_cache.cache): out += "<br />- search cache contents:" out += "<blockquote>" for query, hitset in search_results_cache.cache.items(): out += "<br />%s ... %s" % (query, hitset) out += """</blockquote> <p><a href="%s/search/cache?action=clear">clear search results cache</a>""" % CFG_SITE_URL out += "<p></p>" req.write(out) # show field i18nname cache: out = "<h3>Field I18N names cache</h3>" out += "- fieldname table last updated: %s" % get_table_update_time('fieldname') out += "<br />- i18nname cache timestamp: %s" % field_i18nname_cache.timestamp out += "<br />- i18nname cache contents:" out += "<blockquote>" for field in field_i18nname_cache.cache.keys(): for ln in field_i18nname_cache.cache[field].keys(): out += "%s, %s = %s<br />" % (field, ln, field_i18nname_cache.cache[field][ln]) out += "</blockquote>" req.write(out) # show collection i18nname cache: out = "<h3>Collection I18N names cache</h3>" out += "- collectionname table last updated: %s" % get_table_update_time('collectionname') out += "<br />- i18nname cache timestamp: %s" % collection_i18nname_cache.timestamp out += "<br />- i18nname cache contents:" out += "<blockquote>" for coll in collection_i18nname_cache.cache.keys(): for ln in collection_i18nname_cache.cache[coll].keys(): out += "%s, %s = %s<br />" % (coll, ln, collection_i18nname_cache.cache[coll][ln]) out += "</blockquote>" req.write(out) req.write("</html>") return "\n"

def perform_request_log(req, date=""): """Display search log information for given date.""" req.content_type = "text/html" req.send_http_header() req.write("<html>") req.write("<h1>Search Log</h1>") if date: # case A: display stats for a day yyyymmdd = string.atoi(date) req.write("<p><big><strong>Date: %d</strong></big></p>" % yyyymmdd) req.write("""<table border="1">""") req.write("<tr><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td></tr>" % ("No.", "Time", "Pattern", "Field", "Collection", "Number of Hits")) # read file: p = os.popen("grep ^%d %s/search.log" % (yyyymmdd, CFG_LOGDIR), 'r') lines = p.readlines() p.close() # process lines: i = 0 for line in lines: try: datetime, aas, p, f, c, nbhits = string.split(line,"#") i += 1 req.write("<tr><td>#%d</td><td>%s:%s:%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>" \ % (i, datetime[8:10], datetime[10:12], datetime[12:], p, f, c, nbhits)) except: pass # ignore occasional malformed log lines req.write("</table>") else: # case B: display summary stats per day yyyymm01 = int(time.strftime("%Y%m01", time.localtime())) yyyymmdd = int(time.strftime("%Y%m%d", time.localtime())) req.write("""<table border="1">""") req.write("<tr><td><strong>%s</strong></td><td><strong>%s</strong></td></tr>" % ("Day", "Number of Queries")) for day in range(yyyymm01, yyyymmdd + 1): p = os.popen("grep -c ^%d %s/search.log" % (day, CFG_LOGDIR), 'r') for line in p.readlines(): req.write("""<tr><td>%s</td><td><a href="%s/search/log?date=%d">%s</a></td></tr>""" % \ (day, CFG_SITE_URL, day, line)) p.close() req.write("</table>") req.write("</html>") return "\n"

def get_most_popular_field_values(recids, tags, exclude_values=None, count_repetitive_values=True): """ Analyze RECIDS and look for TAGS and return most popular values and the frequency with which they occur sorted according to descending frequency. If a value is found in EXCLUDE_VALUES, then do not count it. If COUNT_REPETITIVE_VALUES is True, then we count every occurrence of value in the tags. If False, then we count the value only once regardless of the number of times it may appear in a record. (But, if the same value occurs in another record, we count it, of course.) Example: >>> get_most_popular_field_values(range(11,20), '980__a') (('PREPRINT', 10), ('THESIS', 7), ...) >>> get_most_popular_field_values(range(11,20), ('100__a', '700__a')) (('Ellis, J', 10), ('Ellis, N', 7), ...) >>> get_most_popular_field_values(range(11,20), ('100__a', '700__a'), ('Ellis, J',)) (('Ellis, N', 7), ...) """ def _get_most_popular_field_values_helper_sorter(val1, val2): "Compare VAL1 and VAL2 according to, firstly, frequency, then secondly, alphabetically." compared_via_frequencies = cmp(valuefreqdict[val2], valuefreqdict[val1]) if compared_via_frequencies == 0: return cmp(val1.lower(), val2.lower()) else: return compared_via_frequencies valuefreqdict = {} ## sanity check: if not exclude_values: exclude_values = [] if isinstance(tags, str): tags = (tags,) ## find values to count: vals_to_count = [] displaytmp = {} if count_repetitive_values: # counting technique A: can look up many records at once: (very fast) for tag in tags: vals_to_count.extend(get_fieldvalues(recids, tag)) else: # counting technique B: must count record-by-record: (slow) for recid in recids: vals_in_rec = [] for tag in tags: for val in get_fieldvalues(recid, tag, False): vals_in_rec.append(val) # do not count repetitive values within this record # (even across various tags, so need to unify again): dtmp = {} for val in vals_in_rec: dtmp[val.lower()] = 1 displaytmp[val.lower()] = val vals_in_rec = dtmp.keys() vals_to_count.extend(vals_in_rec) ## are we to exclude some of found values?
for val in vals_to_count: if val not in exclude_values: if valuefreqdict.has_key(val): valuefreqdict[val] += 1 else: valuefreqdict[val] = 1 ## sort by descending frequency of values: out = () vals = valuefreqdict.keys() vals.sort(_get_most_popular_field_values_helper_sorter) for val in vals: tmpdisplv = '' if displaytmp.has_key(val): tmpdisplv = displaytmp[val] else: tmpdisplv = val out += (tmpdisplv, valuefreqdict[val]), return out def profile(p="", f="", c=CFG_SITE_NAME): """Profile search time.""" import profile import pstats profile.run("perform_request_search(p='%s',f='%s', c='%s')" % (p, f, c), "perform_request_search_profile") p = pstats.Stats("perform_request_search_profile") p.strip_dirs().sort_stats("cumulative").print_stats() return 0 ## test cases: #print wash_colls(CFG_SITE_NAME,"Library Catalogue", 0) #print wash_colls("Periodicals & Progress Reports",["Periodicals","Progress Reports"], 0) #print wash_field("wau") #print print_record(20,"tm","001,245") #print create_opft_search_units(None, "PHE-87-13","reportnumber") #print ":"+wash_pattern("* and % doo * %")+":\n" #print ":"+wash_pattern("*")+":\n" #print ":"+wash_pattern("ellis* ell* e*%")+":\n" #print run_sql("SELECT name,dbquery from collection") #print get_index_id("author") #print get_coll_ancestors("Theses") #print get_coll_sons("Articles & Preprints") #print get_coll_real_descendants("Articles & Preprints") #print get_collection_reclist("Theses") #print log(sys.stdin) #print search_unit_in_bibrec('2002-12-01','2002-12-12') #print get_nearest_terms_in_bibxxx("ellis", "author", 5, 5) #print call_bibformat(68, "HB_FLY") #print get_fieldvalues(10, "980__a") #print get_fieldvalues_alephseq_like(10,"001___") #print get_fieldvalues_alephseq_like(10,"980__a") #print get_fieldvalues_alephseq_like(10,"foo") #print get_fieldvalues_alephseq_like(10,"-1") #print get_fieldvalues_alephseq_like(10,"99") #print get_fieldvalues_alephseq_like(10,["001", "980"]) ## profiling: #profile("of the this") #print perform_request_search(p="ellis") diff --git a/modules/websearch/lib/websearch_regression_tests.py b/modules/websearch/lib/websearch_regression_tests.py index 6bb88ccff..36d1742a4 100644 --- a/modules/websearch/lib/websearch_regression_tests.py +++ b/modules/websearch/lib/websearch_regression_tests.py @@ -1,1621 +1,1674 @@ # -*- coding: utf-8 -*- ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
# pylint: disable=C0301 # pylint: disable=E1102 """WebSearch module regression tests.""" __revision__ = "$Id$" import unittest import re import urlparse, cgi import sys if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 from mechanize import Browser, LinkNotFoundError from invenio.config import CFG_SITE_URL, CFG_SITE_NAME, CFG_SITE_LANG from invenio.testutils import make_test_suite, \ run_test_suite, \ make_url, test_web_page_content, \ merge_error_messages from invenio.urlutils import same_urls_p from invenio.search_engine import perform_request_search, \ guess_primary_collection_of_a_record, guess_collection_of_a_record, \ collection_restricted_p, get_permitted_restricted_collections, \ get_fieldvalues def parse_url(url): parts = urlparse.urlparse(url) query = cgi.parse_qs(parts[4], True) return parts[2].split('/')[1:], query class WebSearchWebPagesAvailabilityTest(unittest.TestCase): """Check WebSearch web pages whether they are up or not.""" def test_search_interface_pages_availability(self): """websearch - availability of search interface pages""" baseurl = CFG_SITE_URL + '/' _exports = ['', 'collection/Poetry', 'collection/Poetry?as=1'] error_messages = [] for url in [baseurl + page for page in _exports]: error_messages.extend(test_web_page_content(url)) if error_messages: self.fail(merge_error_messages(error_messages)) return def test_search_results_pages_availability(self): """websearch - availability of search results pages""" baseurl = CFG_SITE_URL + '/search' _exports = ['', '?c=Poetry', '?p=ellis', '/cache', '/log'] error_messages = [] for url in [baseurl + page for page in _exports]: error_messages.extend(test_web_page_content(url)) if error_messages: self.fail(merge_error_messages(error_messages)) return def test_search_detailed_record_pages_availability(self): """websearch - availability of search detailed record pages""" baseurl = CFG_SITE_URL + '/record/' _exports = ['', '1', '1/', '1/files', '1/files/'] error_messages = [] for url in [baseurl + page for page in _exports]: error_messages.extend(test_web_page_content(url)) if error_messages: self.fail(merge_error_messages(error_messages)) return def test_browse_results_pages_availability(self): """websearch - availability of browse results pages""" baseurl = CFG_SITE_URL + '/search' _exports = ['?p=ellis&f=author&action_browse=Browse'] error_messages = [] for url in [baseurl + page for page in _exports]: error_messages.extend(test_web_page_content(url)) if error_messages: self.fail(merge_error_messages(error_messages)) return def test_help_page_availability(self): """websearch - availability of Help Central page""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help', expected_text="Help Central")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/?ln=fr', expected_text="Centre d'aide")) def test_search_tips_page_availability(self): """websearch - availability of Search Tips""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-tips', expected_text="Search Tips")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-tips?ln=fr', expected_text="Conseils de recherche")) def test_search_guide_page_availability(self): """websearch - availability of Search Guide""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-guide', expected_text="Search Guide")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-guide?ln=fr', expected_text="Guide de recherche")) class 
WebSearchTestLegacyURLs(unittest.TestCase): """ Check that the application still responds to legacy URLs for navigating, searching and browsing.""" def test_legacy_collections(self): """ websearch - collections handle legacy urls """ browser = Browser() def check(legacy, new, browser=browser): browser.open(legacy) got = browser.geturl() self.failUnless(same_urls_p(got, new), got) # Use the root URL unless we need more check(make_url('/', c=CFG_SITE_NAME), make_url('/', ln=CFG_SITE_LANG)) # Other collections are redirected in the /collection area check(make_url('/', c='Poetry'), make_url('/collection/Poetry', ln=CFG_SITE_LANG)) # Drop unnecessary arguments, like ln and as (when they are # the default value) args = {'as': 0} check(make_url('/', c='Poetry', **args), make_url('/collection/Poetry', ln=CFG_SITE_LANG)) # Otherwise, keep them args = {'as': 1, 'ln': CFG_SITE_LANG} check(make_url('/', c='Poetry', **args), make_url('/collection/Poetry', **args)) # Support the /index.py addressing too check(make_url('/index.py', c='Poetry'), make_url('/collection/Poetry', ln=CFG_SITE_LANG)) def test_legacy_search(self): """ websearch - search queries handle legacy urls """ browser = Browser() def check(legacy, new, browser=browser): browser.open(legacy) got = browser.geturl() self.failUnless(same_urls_p(got, new), got) # /search.py is redirected on /search # Note that `as' is a reserved word in Python 2.5 check(make_url('/search.py', p='nuclear', ln='en') + 'as=1', make_url('/search', p='nuclear', ln='en') + 'as=1') # direct recid searches are redirected to /record check(make_url('/search.py', recid=1, ln='es'), make_url('/record/1', ln='es')) def test_legacy_search_help_link(self): """websearch - legacy Search Help page link""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search/index.en.html', expected_text="Help Central")) def test_legacy_search_tips_link(self): """websearch - legacy Search Tips page link""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search/tips.fr.html', expected_text="Conseils de recherche")) def test_legacy_search_guide_link(self): """websearch - legacy Search Guide page link""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search/guide.en.html', expected_text="Search Guide")) class WebSearchTestRecord(unittest.TestCase): """ Check the interface of the /record results """ def test_format_links(self): """ websearch - check format links for records """ browser = Browser() # We open the record in all known HTML formats for hformat in ('hd', 'hx', 'hm'): browser.open(make_url('/record/1', of=hformat)) if hformat == 'hd': # hd format should have a link to the following # formats for oformat in ('hx', 'hm', 'xm', 'xd'): target = make_url('/record/1/export/%s?ln=en' % oformat) try: browser.find_link(url=target) except LinkNotFoundError: self.fail('link %r should be in page' % target) else: # non-hd HTML formats should have a link back to # the main detailed record target = make_url('/record/1') try: browser.find_link(url=target) except LinkNotFoundError: self.fail('link %r should be in page' % target) return def test_exported_formats(self): """ websearch - check formats exported through /record/1/export/ URLs""" browser = Browser() self.assertEqual([], test_web_page_content(make_url('/record/1/export/hm'), expected_text='245__ $$aALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/hd'), expected_text='ALEPH experiment')) self.assertEqual([], 
test_web_page_content(make_url('/record/1/export/xm'), expected_text='ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/xd'), expected_text='ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/hs'), expected_text='<a href="/record/1?ln=%s">ALEPH experiment' % CFG_SITE_LANG)) self.assertEqual([], test_web_page_content(make_url('/record/1/export/hx'), expected_text='title = "ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/t?ot=245'), expected_text='245__ $$aALEPH experiment')) self.assertNotEqual([], test_web_page_content(make_url('/record/1/export/t?ot=245'), expected_text='001__')) self.assertEqual([], test_web_page_content(make_url('/record/1/export/h?ot=245'), expected_text='245__ $$aALEPH experiment')) self.assertNotEqual([], test_web_page_content(make_url('/record/1/export/h?ot=245'), expected_text='001__')) return class WebSearchTestCollections(unittest.TestCase): def test_traversal_links(self): """ websearch - traverse all the publications of a collection """ browser = Browser() try: for aas in (0, 1): args = {'as': aas} browser.open(make_url('/collection/Preprints', **args)) for jrec in (11, 21, 11, 28): args = {'jrec': jrec, 'cc': 'Preprints'} if aas: args['as'] = aas url = make_url('/search', **args) try: browser.follow_link(url=url) except LinkNotFoundError: args['ln'] = CFG_SITE_LANG url = make_url('/search', **args) browser.follow_link(url=url) except LinkNotFoundError: self.fail('no link %r in %r' % (url, browser.geturl())) def test_collections_links(self): """ websearch - enter collections and subcollections """ browser = Browser() def tryfollow(url): cur = browser.geturl() body = browser.response().read() try: browser.follow_link(url=url) except LinkNotFoundError: print body self.fail("in %r: could not find %r" % ( cur, url)) return for aas in (0, 1): if aas: kargs = {'as': 1} else: kargs = {} kargs['ln'] = CFG_SITE_LANG # We navigate from immediate child to immediate child... browser.open(make_url('/', **kargs)) tryfollow(make_url('/collection/Articles%20%26%20Preprints', **kargs)) tryfollow(make_url('/collection/Articles', **kargs)) # But we can also jump to a grandchild immediately browser.back() browser.back() tryfollow(make_url('/collection/ALEPH', **kargs)) return def test_records_links(self): """ websearch - check the links to records in leaf collections """ browser = Browser() browser.open(make_url('/collection/Preprints')) def harvest(): """ Parse all the links in the page, and check that for each link to a detailed record, we also have the corresponding link to the similar records.""" records = set() similar = set() for link in browser.links(): path, q = parse_url(link.url) if not path: continue if path[0] == 'record': records.add(int(path[1])) continue if path[0] == 'search': if not q.get('rm') == ['wrd']: continue recid = q['p'][0].split(':')[1] similar.add(int(recid)) self.failUnlessEqual(records, similar) return records # We must have 10 links to the corresponding /records found = harvest() self.failUnlessEqual(len(found), 10) # When clicking on the "Search" button, we must also have # these 10 links on the records.
browser.select_form(name="search") browser.submit() found = harvest() self.failUnlessEqual(len(found), 10) return class WebSearchTestBrowse(unittest.TestCase): def test_browse_field(self): """ websearch - check that browsing works """ browser = Browser() browser.open(make_url('/')) browser.select_form(name='search') browser['f'] = ['title'] browser.submit(name='action_browse') def collect(): # We'll get a few links to search for the actual hits, plus a # link to the following results. res = [] for link in browser.links(url_regex=re.compile(CFG_SITE_URL + r'/search\?')): if link.text == 'Advanced Search': continue dummy, q = parse_url(link.url) res.append((link, q)) return res # if we follow the last link, we should get another # batch. There is an overlap of one item. batch_1 = collect() browser.follow_link(link=batch_1[-1][0]) batch_2 = collect() # FIXME: we cannot compare the whole query, as the collection # set is not equal self.failUnlessEqual(batch_1[-2][1]['p'], batch_2[0][1]['p']) class WebSearchTestOpenURL(unittest.TestCase): def test_isbn_01(self): """ websearch - isbn query via OpenURL 0.1""" browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/openurl', isbn='0387940758')) dummy, current_q = parse_url(browser.geturl()) self.failUnlessEqual(current_q, { 'sc' : ['1'], 'p' : ['isbn:"0387940758"'], 'of' : ['hd'] }) def test_isbn_10_rft_id(self): """ websearch - isbn query via OpenURL 1.0 - rft_id""" browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/openurl', rft_id='urn:ISBN:0387940758')) dummy, current_q = parse_url(browser.geturl()) self.failUnlessEqual(current_q, { 'sc' : ['1'], 'p' : ['isbn:"0387940758"'], 'of' : ['hd'] }) def test_isbn_10(self): """ websearch - isbn query via OpenURL 1.0""" browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/openurl?rft.isbn=0387940758')) dummy, current_q = parse_url(browser.geturl()) self.failUnlessEqual(current_q, { 'sc' : ['1'], 'p' : ['isbn:"0387940758"'], 'of' : ['hd'] }) class WebSearchTestSearch(unittest.TestCase): def test_hits_in_other_collection(self): """ websearch - check extension of a query to the home collection """ browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/collection/ISOLDE', ln='en')) browser.select_form(name='search') browser['f'] = ['author'] browser['p'] = 'matsubara' browser.submit() dummy, current_q = parse_url(browser.geturl()) link = browser.find_link(text_regex=re.compile('.*hit', re.I)) dummy, target_q = parse_url(link.url) # the target query should be the current query without any c # or cc specified. for f in ('cc', 'c', 'action_search'): if f in current_q: del current_q[f] self.failUnlessEqual(current_q, target_q) def test_nearest_terms(self): """ websearch - provide a list of nearest terms """ browser = Browser() browser.open(make_url('')) # Search something weird browser.select_form(name='search') browser['p'] = 'gronf' browser.submit() dummy, original = parse_url(browser.geturl()) for to_drop in ('cc', 'action_search', 'f'): if to_drop in original: del original[to_drop] if 'ln' not in original: original['ln'] = [CFG_SITE_LANG] # we should get a few searches back, which are identical # except for the p field being substituted (and the cc field # being dropped). 
if 'cc' in original: del original['cc'] for link in browser.links(url_regex=re.compile(CFG_SITE_URL + r'/search\?')): if link.text == 'Advanced Search': continue dummy, target = parse_url(link.url) if 'ln' not in target: target['ln'] = [CFG_SITE_LANG] original['p'] = [link.text] self.failUnlessEqual(original, target) return def test_switch_to_simple_search(self): """ websearch - switch to simple search """ browser = Browser() args = {'as': 1} browser.open(make_url('/collection/ISOLDE', **args)) browser.select_form(name='search') browser['p1'] = 'tandem' browser['f1'] = ['title'] browser.submit() browser.follow_link(text='Simple Search') dummy, q = parse_url(browser.geturl()) self.failUnlessEqual(q, {'cc': ['ISOLDE'], 'p': ['tandem'], 'f': ['title'], 'ln': ['en']}) def test_switch_to_advanced_search(self): """ websearch - switch to advanced search """ browser = Browser() browser.open(make_url('/collection/ISOLDE')) browser.select_form(name='search') browser['p'] = 'tandem' browser['f'] = ['title'] browser.submit() browser.follow_link(text='Advanced Search') dummy, q = parse_url(browser.geturl()) self.failUnlessEqual(q, {'cc': ['ISOLDE'], 'p1': ['tandem'], 'f1': ['title'], 'as': ['1'], 'ln' : ['en']}) def test_no_boolean_hits(self): """ websearch - check the 'no boolean hits' proposed links """ browser = Browser() browser.open(make_url('')) browser.select_form(name='search') browser['p'] = 'quasinormal muon' browser.submit() dummy, q = parse_url(browser.geturl()) for to_drop in ('cc', 'action_search', 'f'): if to_drop in q: del q[to_drop] for bsu in ('quasinormal', 'muon'): l = browser.find_link(text=bsu) q['p'] = bsu if not same_urls_p(l.url, make_url('/search', **q)): self.fail(repr((l.url, make_url('/search', **q)))) def test_similar_authors(self): """ websearch - test similar authors box """ browser = Browser() browser.open(make_url('')) browser.select_form(name='search') browser['p'] = 'Ellis, R K' browser['f'] = ['author'] browser.submit() l = browser.find_link(text="Ellis, R S") self.failUnless(same_urls_p(l.url, make_url('/search', p="Ellis, R S", f='author', ln='en'))) class WebSearchNearestTermsTest(unittest.TestCase): """Check various alternatives of searches leading to the nearest terms box.""" def test_nearest_terms_box_in_okay_query(self): """ websearch - no nearest terms box for a successful query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text="jump to record")) def test_nearest_terms_box_in_unsuccessful_simple_query(self): """ websearch - nearest terms box for unsuccessful simple query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellisz', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=embed", expected_link_label='embed')) def test_nearest_terms_box_in_unsuccessful_simple_accented_query(self): """ websearch - nearest terms box for unsuccessful accented query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=elliszà', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=embed", expected_link_label='embed')) def test_nearest_terms_box_in_unsuccessful_structured_query(self): """ websearch - nearest terms box for unsuccessful structured query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellisz&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=fabbro&f=author", 
expected_link_label='fabbro')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=author%3Aellisz', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=author%3Afabbro", expected_link_label='fabbro')) def test_nearest_terms_box_in_unsuccessful_phrase_query(self): """ websearch - nearest terms box for unsuccessful phrase query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%22Ellis%2C+Z%22', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=author%3A%22Enqvist%2C+K%22", expected_link_label='Enqvist, K')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%22ellisz%22&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=%22Enqvist%2C+K%22&f=author", expected_link_label='Enqvist, K')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%22elliszà%22&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=%22Enqvist%2C+K%22&f=author", expected_link_label='Enqvist, K')) def test_nearest_terms_box_in_unsuccessful_boolean_query(self): """ websearch - nearest terms box for unsuccessful boolean query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3Aellisz+author%3Aellisz', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aellisz", expected_link_label='energi')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3Aenergi+author%3Aenergie', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aenqvist", expected_link_label='enqvist')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?ln=en&p=title%3Aellisz+author%3Aellisz&f=keyword', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aellisz&f=keyword", expected_link_label='energi')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?ln=en&p=title%3Aenergi+author%3Aenergie&f=keyword', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aenqvist&f=keyword", expected_link_label='enqvist')) class WebSearchBooleanQueryTest(unittest.TestCase): """Check various boolean queries.""" def test_successful_boolean_query(self): """ websearch - successful boolean query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis+muon', expected_text="records found", expected_link_label="Detailed record")) def test_unsuccessful_boolean_query_where_all_individual_terms_match(self): """ websearch - unsuccessful boolean query where all individual terms match """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis+muon+letter', expected_text="Boolean query returned no hits. 
Please combine your search terms differently.")) class WebSearchAuthorQueryTest(unittest.TestCase): """Check various author-related queries.""" def test_propose_similar_author_names_box(self): """ websearch - propose similar author names box """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=Ellis%2C+R&f=author', expected_text="See also: similar author names", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=Ellis%2C+R+K&f=author", expected_link_label="Ellis, R K")) def test_do_not_propose_similar_author_names_box(self): """ websearch - do not propose similar author names box """ errmsgs = test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%22Ellis%2C+R%22', expected_link_target=CFG_SITE_URL+"/search?ln=en&p=Ellis%2C+R+K&f=author", expected_link_label="Ellis, R K") if errmsgs[0].find("does not contain link to") > -1: pass else: self.fail("Should not propose similar author names box.") return class WebSearchSearchEnginePythonAPITest(unittest.TestCase): """Check typical search engine Python API calls on the demo data.""" def test_search_engine_python_api_for_failed_query(self): """websearch - search engine Python API for failed query""" self.assertEqual([], perform_request_search(p='aoeuidhtns')) def test_search_engine_python_api_for_successful_query(self): """websearch - search engine Python API for successful query""" self.assertEqual([8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 47], perform_request_search(p='ellis')) def test_search_engine_python_api_for_existing_record(self): """websearch - search engine Python API for existing record""" self.assertEqual([8], perform_request_search(recid=8)) def test_search_engine_python_api_for_nonexisting_record(self): """websearch - search engine Python API for non-existing record""" self.assertEqual([], perform_request_search(recid=1234567809)) def test_search_engine_python_api_for_nonexisting_collection(self): """websearch - search engine Python API for non-existing collection""" self.assertEqual([], perform_request_search(c='Foo')) def test_search_engine_python_api_for_range_of_records(self): """websearch - search engine Python API for range of records""" self.assertEqual([1, 2, 3, 4, 5, 6, 7, 8, 9], perform_request_search(recid=1, recidb=10)) def test_search_engine_python_api_ranked_by_citation(self): """websearch - search engine Python API for citation ranking""" self.assertEqual([82, 83, 87, 89], perform_request_search(p='recid:81', rm='citation')) def test_search_engine_python_api_textmarc(self): """websearch - search engine Python API for Text MARC output""" # we are testing example from /help/hacking/search-engine-api import cStringIO tmp = cStringIO.StringIO() perform_request_search(req=tmp, p='higgs', of='tm', ot=['100', '700']) out = tmp.getvalue() tmp.close() self.assertEqual(out, """\ 000000085 100__ $$aGirardello, L$$uINFN$$uUniversita di Milano-Bicocca 000000085 700__ $$aPorrati, Massimo 000000085 700__ $$aZaffaroni, A 000000001 100__ $$aPhotolab """) class WebSearchSearchEngineWebAPITest(unittest.TestCase): """Check typical search engine Web API calls on the demo data.""" def test_search_engine_web_api_for_failed_query(self): """websearch - search engine Web API for failed query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=aoeuidhtns&of=id', expected_text="[]")) def test_search_engine_web_api_for_successful_query(self): """websearch - search engine Web API for successful query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis&of=id', expected_text="[8, 
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 47]")) def test_search_engine_web_api_for_existing_record(self): """websearch - search engine Web API for existing record""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?recid=8&of=id', expected_text="[8]")) def test_search_engine_web_api_for_nonexisting_record(self): """websearch - search engine Web API for non-existing record""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?recid=123456789&of=id', expected_text="[]")) def test_search_engine_web_api_for_nonexisting_collection(self): """websearch - search engine Web API for non-existing collection""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?c=Foo&of=id', expected_text="[]")) def test_search_engine_web_api_for_range_of_records(self): """websearch - search engine Web API for range of records""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?recid=1&recidb=10&of=id', expected_text="[1, 2, 3, 4, 5, 6, 7, 8, 9]")) class WebSearchRestrictedCollectionTest(unittest.TestCase): """Test of the restricted Theses collection behaviour.""" def test_restricted_collection_interface_page(self): """websearch - restricted collection interface page body""" # there should be no Latest additions box for restricted collections self.assertNotEqual([], test_web_page_content(CFG_SITE_URL + '/collection/Theses', expected_text="Latest additions")) def test_restricted_search_as_anonymous_guest(self): """websearch - restricted collection not searchable by anonymous guest""" browser = Browser() browser.open(CFG_SITE_URL + '/search?c=Theses') response = browser.response().read() if response.find("If you think you have right to access it, please authenticate yourself.") > -1: pass else: self.fail("Oops, searching restricted collection without password should have redirected to login dialog.") return def test_restricted_search_as_authorized_person(self): """websearch - restricted collection searchable by authorized person""" browser = Browser() browser.open(CFG_SITE_URL + '/search?c=Theses') browser.select_form(nr=0) browser['p_un'] = 'jekyll' browser['p_pw'] = 'j123ekyll' browser.submit() if browser.response().read().find("records found") > -1: pass else: self.fail("Oops, Dr. Jekyll should be able to search Theses collection.") def test_restricted_search_as_unauthorized_person(self): """websearch - restricted collection not searchable by unauthorized person""" browser = Browser() browser.open(CFG_SITE_URL + '/search?c=Theses') browser.select_form(nr=0) browser['p_un'] = 'hyde' browser['p_pw'] = 'h123yde' browser.submit() # Mr. 
Hyde should not be able to connect: if browser.response().read().find("Authorization failure") == -1: # if we got here, things are broken: self.fail("Oops, Mr. Hyde should not be able to search Theses collection.") def test_restricted_detailed_record_page_as_anonymous_guest(self): """websearch - restricted detailed record page not accessible to guests""" browser = Browser() browser.open(CFG_SITE_URL + '/record/35') if browser.response().read().find("You can use your nickname or your email address to login.") > -1: pass else: self.fail("Oops, accessing restricted detailed record page without password should have redirected to login dialog.") return def test_restricted_detailed_record_page_as_authorized_person(self): """websearch - restricted detailed record page accessible to authorized person""" browser = Browser() browser.open(CFG_SITE_URL + '/youraccount/login') browser.select_form(nr=0) browser['p_un'] = 'jekyll' browser['p_pw'] = 'j123ekyll' browser.submit() browser.open(CFG_SITE_URL + '/record/35') # Dr. Jekyll should be able to connect # (add the pw to the whole CFG_SITE_URL because we shall be # redirected to '/recordrestricted/'): if browser.response().read().find("A High-performance Video Browsing System") > -1: pass else: self.fail("Oops, Dr. Jekyll should be able to access restricted detailed record page.") def test_restricted_detailed_record_page_as_unauthorized_person(self): """websearch - restricted detailed record page not accessible to unauthorized person""" browser = Browser() browser.open(CFG_SITE_URL + '/youraccount/login') browser.select_form(nr=0) browser['p_un'] = 'hyde' browser['p_pw'] = 'h123yde' browser.submit() browser.open(CFG_SITE_URL + '/record/35') # Mr. Hyde should not be able to connect: if browser.response().read().find('You are not authorized') == -1: # if we got here, things are broken: self.fail("Oops, Mr. Hyde should not be able to access restricted detailed record page.") def test_collection_restricted_p(self): """websearch - collection_restricted_p""" self.failUnless(collection_restricted_p('Theses')) self.failIf(collection_restricted_p('Books & Reports')) def test_get_permitted_restricted_collections(self): """websearch - get_permitted_restricted_collections""" from invenio.webuser import get_uid_from_email, collect_user_info self.assertEqual(get_permitted_restricted_collections(collect_user_info(get_uid_from_email('jekyll@cds.cern.ch'))), ['Theses']) self.assertEqual(get_permitted_restricted_collections(collect_user_info(get_uid_from_email('hyde@cds.cern.ch'))), []) class WebSearchRestrictedPicturesTest(unittest.TestCase): """ Check whether restricted pictures on the demo site can be accessed by people who have rights to access them. """ def test_restricted_pictures_guest(self): """websearch - restricted pictures not available to guest""" error_messages = test_web_page_content(CFG_SITE_URL + '/record/1/files/0106015_01.jpg', expected_text=['This file is restricted.
If you think you have right to access it, please authenticate yourself.']) if error_messages: self.fail(merge_error_messages(error_messages)) def test_restricted_pictures_romeo(self): """websearch - restricted pictures available to Romeo""" error_messages = test_web_page_content(CFG_SITE_URL + '/record/1/files/0106015_01.jpg', username='romeo', password='r123omeo', expected_text=[], unexpected_text=['This file is restricted', 'You are not authorized']) if error_messages: self.fail(merge_error_messages(error_messages)) def test_restricted_pictures_hyde(self): """websearch - restricted pictures not available to Mr. Hyde""" error_messages = test_web_page_content(CFG_SITE_URL + '/record/1/files/0106015_01.jpg', username='hyde', password='h123yde', expected_text=['This file is restricted', 'You are not authorized']) if error_messages: self.failUnless("HTTP Error 401: Unauthorized" in merge_error_messages(error_messages)) class WebSearchRSSFeedServiceTest(unittest.TestCase): """Test of the RSS feed service.""" def test_rss_feed_service(self): """websearch - RSS feed service""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/rss', expected_text='<rss version="2.0">')) class WebSearchResultsOverview(unittest.TestCase): """Test of the search results page's Results overview box and links.""" def test_results_overview_split_off(self): """websearch - results overview box when split by collection is off""" browser = Browser() browser.open(CFG_SITE_URL + '/search?p=of&sc=0') body = browser.response().read() if body.find("Results overview") > -1: self.fail("Oops, when split by collection is off, " "results overview should not be present.") if body.find('<a name="1"></a>') == -1: self.fail("Oops, when split by collection is off, " "Atlantis collection should be found.") if body.find('<a name="15"></a>') > -1: self.fail("Oops, when split by collection is off, " "Multimedia & Arts should not be found.") try: browser.find_link(url='#15') self.fail("Oops, when split by collection is off, " "a link to Multimedia & Arts should not be found.") except LinkNotFoundError: pass def test_results_overview_split_on(self): """websearch - results overview box when split by collection is on""" browser = Browser() browser.open(CFG_SITE_URL + '/search?p=of&sc=1') body = browser.response().read() if body.find("Results overview") == -1: self.fail("Oops, when split by collection is on, " "results overview should be present.") if body.find('<a name="1"></a>') > -1: self.fail("Oops, when split by collection is on, " "Atlantis collection should not be found.") if body.find('<a name="15"></a>') == -1: self.fail("Oops, when split by collection is on, " "Multimedia & Arts should be found.") try: browser.find_link(url='#15') except LinkNotFoundError: self.fail("Oops, when split by collection is on, " "a link to Multimedia & Arts should be found.") class WebSearchSortResultsTest(unittest.TestCase): """Test of the search results page's sorting capability.""" def test_sort_results_default(self): """websearch - search results sorting, default method""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1', expected_text="[TESLA-FEL-99-07]")) def test_sort_results_ascending(self): """websearch - search results sorting, ascending field""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1&sf=reportnumber&so=a', expected_text="ISOLTRAP")) def test_sort_results_descending(self): """websearch - search results sorting, descending field""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1&sf=reportnumber&so=d', expected_text=" [TESLA-FEL-99-07]")) def test_sort_results_sort_pattern(self): """websearch - search results sorting, preferential sort pattern""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1&sf=reportnumber&so=d&sp=cern', expected_text="[CERN-TH-2002-069]")) class WebSearchSearchResultsXML(unittest.TestCase): """Test search results in
various output""" def test_search_results_xm_output_split_on(self): """ websearch - check document element of search results in xm output (split by collection on)""" browser = Browser() browser.open(CFG_SITE_URL + '/search?sc=1&of=xm') body = browser.response().read() num_doc_element = body.count('<collection xmlns="http://www.loc.gov/MARC21/slim">') if num_doc_element == 0: self.fail("Oops, no document element " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements " "found in search results.") num_doc_element = body.count("</collection>") if num_doc_element == 0: self.fail("Oops, no document element " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements " "found in search results.") def test_search_results_xm_output_split_off(self): """ websearch - check document element of search results in xm output (split by collection off)""" browser = Browser() browser.open(CFG_SITE_URL + '/search?sc=0&of=xm') body = browser.response().read() num_doc_element = body.count('<collection xmlns="http://www.loc.gov/MARC21/slim">') if num_doc_element == 0: self.fail("Oops, no document element " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements " "found in search results.") num_doc_element = body.count("</collection>") if num_doc_element == 0: self.fail("Oops, no document element " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements " "found in search results.") def test_search_results_xd_output_split_on(self): """ websearch - check document element of search results in xd output (split by collection on)""" browser = Browser() browser.open(CFG_SITE_URL + '/search?sc=1&of=xd') body = browser.response().read() num_doc_element = body.count("<collection") if num_doc_element == 0: self.fail("Oops, no document element " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements " "found in search results.") num_doc_element = body.count("</collection>") if num_doc_element == 0: self.fail("Oops, no document element " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements " "found in search results.") def test_search_results_xd_output_split_off(self): """ websearch - check document element of search results in xd output (split by collection off)""" browser = Browser() browser.open(CFG_SITE_URL + '/search?sc=0&of=xd') body = browser.response().read() num_doc_element = body.count("<collection") if num_doc_element == 0: self.fail("Oops, no document element " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements " "found in search results.") num_doc_element = body.count("</collection>") if num_doc_element == 0: self.fail("Oops, no document element " "found in search results.") elif num_doc_element > 1: self.fail("Oops, multiple document elements " "found in search results.") class WebSearchUnicodeQueryTest(unittest.TestCase): """Test of the search results for queries containing Unicode characters.""" def test_unicode_word_query(self): """websearch - Unicode word query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%CE%99%CE%B8%CE%AC%CE%BA%CE%B7', expected_text="[76]")) def test_unicode_word_query_not_found_term(self): """websearch - Unicode word query, not found term""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3A%CE%99%CE%B8', expected_text="ιθάκη")) def test_unicode_exact_phrase_query(self): """websearch - Unicode exact phrase query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%22%CE%99%CE%B8%CE%AC%CE%BA%CE%B7%22', expected_text="[76]"))
def test_unicode_partial_phrase_query(self): """websearch - Unicode partial phrase query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%27%CE%B7%27', expected_text="[76]")) def test_unicode_regexp_query(self): """websearch - Unicode regexp query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%2F%CE%B7%2F', expected_text="[76]")) class WebSearchMARCQueryTest(unittest.TestCase): """Test of the search results for queries containing physical MARC tags.""" def test_single_marc_tag_exact_phrase_query(self): """websearch - single MARC tag, exact phrase query (100__a)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=100__a%3A%22Ellis%2C+J%22', expected_text="[9, 14, 18]")) def test_single_marc_tag_partial_phrase_query(self): """websearch - single MARC tag, partial phrase query (245__b)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245__b%3A%27and%27', expected_text="[28]")) def test_many_marc_tags_partial_phrase_query(self): """websearch - many MARC tags, partial phrase query (245)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245%3A%27and%27', expected_text="[1, 8, 9, 14, 15, 20, 22, 24, 28, 33, 47, 48, 49, 51, 53, 64, 69, 71, 79, 82, 83, 85, 91, 96]")) def test_single_marc_tag_regexp_query(self): """websearch - single MARC tag, regexp query""" # NOTE: regexp queries for physical MARC tags (e.g. 245:/and/) # are deliberately not treated by the search engine. But maybe # we should support them?! self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245%3A%2Fand%2F', expected_text="[]")) class WebSearchExtSysnoQueryTest(unittest.TestCase): """Test of queries using external system numbers.""" def test_existing_sysno_html_output(self): """websearch - external sysno query, existing sysno, HTML output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CER', expected_text="The wall of the cave")) def test_existing_sysno_id_output(self): """websearch - external sysno query, existing sysno, ID output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CER&of=id', expected_text="[95]")) def test_nonexisting_sysno_html_output(self): """websearch - external sysno query, non-existing sysno, HTML output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CERRRR', expected_text="Requested record does not seem to exist.")) def test_nonexisting_sysno_id_output(self): """websearch - external sysno query, non-existing sysno, ID output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CERRRR&of=id', expected_text="[]")) class WebSearchResultsRecordGroupingTest(unittest.TestCase): """Test search results page record grouping (rg).""" def test_search_results_rg_guest(self): """websearch - search results, records in groups of, guest""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?rg=17', expected_text="1 - 17")) def test_search_results_rg_nonguest(self): """websearch - search results, records in groups of, non-guest""" # This test used to fail due to saved user preference fetching # not being overridden by the URL rg argument.
self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?rg=17', username='admin', expected_text="1 - 17")) class WebSearchSpecialTermsQueryTest(unittest.TestCase): """Test of the search results for queries containing special terms.""" def test_special_terms_u1(self): """websearch - query for special terms, U(1)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=U%281%29', expected_text="[57, 79, 80, 88]")) def test_special_terms_u1_and_sl(self): """websearch - query for special terms, U(1) SL(2,Z)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=U%281%29+SL%282%2CZ%29', expected_text="[88]")) def test_special_terms_u1_and_sl_or(self): """websearch - query for special terms, U(1) OR SL(2,Z)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=U%281%29+OR+SL%282%2CZ%29', expected_text="[57, 79, 80, 88]")) def test_special_terms_u1_and_sl_or_parens(self): """websearch - query for special terms, (U(1) OR SL(2,Z))""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=%28U%281%29+OR+SL%282%2CZ%29%29', expected_text="[57, 79, 80, 88]")) class WebSearchJournalQueryTest(unittest.TestCase): """Test of the search results for journal pubinfo queries.""" def test_query_journal_title_only(self): """websearch - journal publication info query, title only""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&f=journal&p=Phys.+Lett.+B', expected_text="[77, 78, 85, 87]")) def test_query_journal_full_pubinfo(self): """websearch - journal publication info query, full reference""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&f=journal&p=Phys.+Lett.+B+531+%282002%29+301', expected_text="[78]")) class WebSearchStemmedIndexQueryTest(unittest.TestCase): """Test of the search results for queries using stemmed indexes.""" def test_query_stemmed_lowercase(self): """websearch - stemmed index query, lowercase""" # note that dasse/Dasse is stemmed into dass/Dass, as expected self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=dasse', expected_text="[25, 26]")) def test_query_stemmed_uppercase(self): """websearch - stemmed index query, uppercase""" # ... but note also that DASSE is stemmed into DASSE(!); so # the test would fail if the search engine did not lowercase the # query term. (Something that is not necessary for # non-stemmed indexes.)
self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=DASSE', expected_text="[25, 26]")) class WebSearchSummarizerTest(unittest.TestCase): """Test of the search results summarizer functions.""" def test_most_popular_field_values_singletag(self): """websearch - most popular field values, simple tag""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('PREPRINT', 37), ('ARTICLE', 28), ('BOOK', 14), ('THESIS', 8), ('PICTURE', 7), ('POETRY', 2), ('REPORT', 2), ('ATLANTISTIMESNEWS', 1)), get_most_popular_field_values(range(0,100), '980__a')) def test_most_popular_field_values_singletag_multiexclusion(self): """websearch - most popular field values, simple tag, multiple exclusions""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('PREPRINT', 37), ('ARTICLE', 28), ('BOOK', 14), ('REPORT', 2), ('ATLANTISTIMESNEWS', 1)), get_most_popular_field_values(range(0,100), '980__a', ('THESIS', 'PICTURE', 'POETRY'))) def test_most_popular_field_values_multitag(self): """websearch - most popular field values, multiple tags""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('Ellis, J', 3), ('Enqvist, K', 1), ('Ibanez, L E', 1), ('Nanopoulos, D V', 1), ('Ross, G G', 1)), get_most_popular_field_values((9, 14, 18), ('100__a', '700__a'))) def test_most_popular_field_values_multitag_singleexclusion(self): """websearch - most popular field values, multiple tags, single exclusion""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('Enqvist, K', 1), ('Ibanez, L E', 1), ('Nanopoulos, D V', 1), ('Ross, G G', 1)), get_most_popular_field_values((9, 14, 18), ('100__a', '700__a'), ('Ellis, J',))) def test_most_popular_field_values_multitag_countrepetitive(self): """websearch - most popular field values, multiple tags, counting repetitive occurrences""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('THESIS', 2), ('REPORT', 1)), get_most_popular_field_values((41,), ('690C_a', '980__a'), count_repetitive_values=True)) self.assertEqual((('REPORT', 1), ('THESIS', 1)), get_most_popular_field_values((41,), ('690C_a', '980__a'), count_repetitive_values=False)) def test_ellis_citation_summary(self): """websearch - query ellis, citation summary output format""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis&of=hcs', expected_text="Less known papers (1-9)", expected_link_target=CFG_SITE_URL+"/search?p=ellis%20AND%20cited%3A1-%3E9&rm=citation", expected_link_label='1')) class WebSearchRecordCollectionGuessTest(unittest.TestCase): """Primary collection guessing tests.""" def test_guess_primary_collection_of_a_record(self): """websearch - guess_primary_collection_of_a_record""" self.assertEqual(guess_primary_collection_of_a_record(96), 'Articles') def test_guess_collection_of_a_record(self): """websearch - guess_collection_of_a_record""" self.assertEqual(guess_collection_of_a_record(96), 'Articles') self.assertEqual(guess_collection_of_a_record(96, '%s/collection/Theoretical Physics (TH)?ln=en' % CFG_SITE_URL), 'Articles') self.assertEqual(guess_collection_of_a_record(12, '%s/collection/Theoretical Physics (TH)?ln=en' % CFG_SITE_URL), 'Theoretical Physics (TH)') self.assertEqual(guess_collection_of_a_record(12, '%s/collection/Theoretical%%20Physics%%20%%28TH%%29?ln=en' % CFG_SITE_URL), 'Theoretical Physics (TH)') class WebSearchGetFieldValuesTest(unittest.TestCase): """Testing get_fieldvalues() function.""" def
test_get_fieldvalues_001(self): """websearch - get_fieldvalues() for bibxxx-agnostic tags""" self.assertEqual(get_fieldvalues(10, '001___'), ['10']) def test_get_fieldvalues_980(self): """websearch - get_fieldvalues() for bibxxx-powered tags""" self.assertEqual(get_fieldvalues(18, '700__a'), ['Enqvist, K', 'Nanopoulos, D V']) self.assertEqual(get_fieldvalues(18, '909C1u'), ['CERN']) def test_get_fieldvalues_wildcard(self): """websearch - get_fieldvalues() for tag wildcards""" self.assertEqual(get_fieldvalues(18, '%'), []) self.assertEqual(get_fieldvalues(18, '7%'), []) self.assertEqual(get_fieldvalues(18, '700%'), ['Enqvist, K', 'Nanopoulos, D V']) self.assertEqual(get_fieldvalues(18, '909C0%'), ['1985', '13', 'TH']) def test_get_fieldvalues_recIDs(self): """websearch - get_fieldvalues() for list of recIDs""" self.assertEqual(get_fieldvalues([], '001___'), []) self.assertEqual(get_fieldvalues([], '700__a'), []) self.assertEqual(get_fieldvalues([10, 13], '001___'), ['10', '13']) self.assertEqual(get_fieldvalues([18, 13], '700__a'), ['Dawson, S', 'Ellis, R K', 'Enqvist, K', 'Nanopoulos, D V']) def test_get_fieldvalues_repetitive(self): """websearch - get_fieldvalues() for repetitive values""" self.assertEqual(get_fieldvalues([17, 18], '909C1u'), ['CERN', 'CERN']) self.assertEqual(get_fieldvalues([17, 18], '909C1u', repetitive_values=True), ['CERN', 'CERN']) self.assertEqual(get_fieldvalues([17, 18], '909C1u', repetitive_values=False), ['CERN']) class WebSearchAddToBasketTest(unittest.TestCase): """Test of the add-to-basket presence depending on user rights.""" def test_add_to_basket_guest(self): """websearch - add-to-basket facility allowed for guests""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', expected_text='Add to basket')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', expected_text='<input name="recid" type="checkbox" value="10" />')) def test_add_to_basket_jekyll(self): """websearch - add-to-basket facility allowed for Dr. Jekyll""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', expected_text='Add to basket', username='jekyll', password='j123ekyll')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', expected_text='<input name="recid" type="checkbox" value="10" />', username='jekyll', password='j123ekyll')) def test_add_to_basket_hyde(self): """websearch - add-to-basket facility denied to Mr. Hyde""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', unexpected_text='Add to basket', username='hyde', password='h123yde')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', unexpected_text='<input name="recid" type="checkbox" value="10" />', username='hyde', password='h123yde')) class WebSearchAlertTeaserTest(unittest.TestCase): """Test of the alert teaser presence depending on user rights.""" def test_alert_teaser_guest(self): """websearch - alert teaser allowed for guests""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_link_label='email alert')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='RSS feed')) def test_alert_teaser_jekyll(self): """websearch - alert teaser allowed for Dr.
Jekyll""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='email alert', username='jekyll', password='j123ekyll')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='RSS feed', username='jekyll', password='j123ekyll')) def test_alert_teaser_hyde(self): """websearch - alert teaser allowed for Mr. Hyde""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='email alert', username='hyde', password='h123yde')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='RSS feed', username='hyde', password='h123yde')) class WebSearchSpanQueryTest(unittest.TestCase): """Test of span queries.""" def test_span_in_word_index(self): """websearch - span query in a word index""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=year%3A1992-%3E1996&of=id&ap=0', expected_text='[17, 66, 69, 71]')) def test_span_in_phrase_index(self): """websearch - span query in a phrase index""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=year%3A%221992%22-%3E%221996%22&of=id&ap=0', expected_text='[17, 66, 69, 71]')) def test_span_in_bibxxx(self): """websearch - span query in MARC tables""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=909C0y%3A%221992%22-%3E%221996%22&of=id&ap=0', expected_text='[17, 66, 69, 71]')) def test_span_with_spaces(self): """websearch - no span query when a space is around""" # useful for reaction search self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3A%27mu%20--%3E%20e%27&of=id&ap=0', expected_text='[67]')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=245%3A%27mu%20--%3E%20e%27&of=id&ap=0', expected_text='[67]')) def test_span_in_author(self): """websearch - span query in special author index""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%22Ellis,%20K%22-%3E%22Ellis,%20RZ%22&of=id&ap=0', expected_text='[8, 11, 13, 17, 47]')) +class WebSearchReferstoCitedbyTest(unittest.TestCase): + """Test of refersto/citedby search operators.""" + + def test_refersto_recid(self): + 'websearch - refersto:recid:84' + self.assertEqual([], + test_web_page_content(CFG_SITE_URL + '/search?p=refersto%3Arecid%3A84&of=id&ap=0', + expected_text='[85, 88, 91]')) + + def test_refersto_repno(self): + 'websearch - refersto:reportnumber:hep-th/0205061' + self.assertEqual([], + test_web_page_content(CFG_SITE_URL + '/search?p=refersto%3Areportnumber%3Ahep-th/0205061&of=id&ap=0', + expected_text='[91]')) + + def test_refersto_author_word(self): + 'websearch - refersto:author:klebanov' + self.assertEqual([], + test_web_page_content(CFG_SITE_URL + '/search?p=refersto%3Aauthor%3Aklebanov&of=id&ap=0', + expected_text='[85, 86, 88, 91]')) + + def test_refersto_author_phrase(self): + 'websearch - refersto:author:"Klebanov, I"' + self.assertEqual([], + test_web_page_content(CFG_SITE_URL + '/search?p=refersto%3Aauthor%3A%22Klebanov,%20I%22&of=id&ap=0', + expected_text='[85, 86, 88, 91]')) + + def test_citedby_recid(self): + 'websearch - citedby:recid:92' + self.assertEqual([], + test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Arecid%3A92&of=id&ap=0', + expected_text='[74, 91]')) + + def test_citedby_repno(self): + 'websearch - citedby:reportnumber:hep-th/0205061' + self.assertEqual([], + test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Areportnumber%3Ahep-th/0205061&of=id&ap=0', + expected_text='[78]')) + + def 
test_citedby_author_word(self): + 'websearch - citedby:author:klebanov' + self.assertEqual([], + test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Aauthor%3Aklebanov&of=id&ap=0', + expected_text='[95]')) + + def test_citedby_author_phrase(self): + 'websearch - citedby:author:"Klebanov, I"' + self.assertEqual([], + test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Aauthor%3A%22Klebanov,%20I%22&of=id&ap=0', + expected_text='[95]')) + + TEST_SUITE = make_test_suite(WebSearchWebPagesAvailabilityTest, WebSearchTestSearch, WebSearchTestBrowse, WebSearchTestOpenURL, WebSearchTestCollections, WebSearchTestRecord, WebSearchTestLegacyURLs, WebSearchNearestTermsTest, WebSearchBooleanQueryTest, WebSearchAuthorQueryTest, WebSearchSearchEnginePythonAPITest, WebSearchSearchEngineWebAPITest, WebSearchRestrictedCollectionTest, WebSearchRestrictedPicturesTest, WebSearchRSSFeedServiceTest, WebSearchXSSVulnerabilityTest, WebSearchResultsOverview, WebSearchSortResultsTest, WebSearchSearchResultsXML, WebSearchUnicodeQueryTest, WebSearchMARCQueryTest, WebSearchExtSysnoQueryTest, WebSearchResultsRecordGroupingTest, WebSearchSpecialTermsQueryTest, WebSearchJournalQueryTest, WebSearchStemmedIndexQueryTest, WebSearchSummarizerTest, WebSearchRecordCollectionGuessTest, WebSearchGetFieldValuesTest, WebSearchAddToBasketTest, WebSearchAlertTeaserTest, - WebSearchSpanQueryTest) + WebSearchSpanQueryTest, + WebSearchReferstoCitedbyTest) if __name__ == "__main__": run_test_suite(TEST_SUITE, warn_user=True)
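# Note: this suite expects a live Invenio demo installation loaded with
# the Atlantis demo records; the hardcoded record IDs, hit lists and
# demo accounts (jekyll, hyde, romeo, admin) used above come from that
# demo data set.
#
# A quick way to run a single test class (a sketch using the standard
# unittest API), e.g. only the new refersto/citedby tests:
#
#   import unittest
#   suite = unittest.TestLoader().loadTestsFromTestCase(WebSearchReferstoCitedbyTest)
#   unittest.TextTestRunner(verbosity=2).run(suite)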