diff --git a/modules/websearch/lib/search_engine.py b/modules/websearch/lib/search_engine.py index 6374ae432..1f2023f5c 100644 --- a/modules/websearch/lib/search_engine.py +++ b/modules/websearch/lib/search_engine.py @@ -1,5923 +1,5923 @@ # -*- coding: utf-8 -*- ## This file is part of Invenio. ## Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. # pylint: disable=C0301 """Invenio Search Engine in mod_python.""" __lastupdated__ = """$Date$""" __revision__ = "$Id$" ## import general modules: import cgi import cStringIO import copy import string import os import re import time import urllib import urlparse import zlib import sys if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 ## import Invenio stuff: from invenio.config import \ CFG_CERN_SITE, \ CFG_INSPIRE_SITE, \ CFG_OAI_ID_FIELD, \ CFG_WEBCOMMENT_ALLOW_REVIEWS, \ CFG_WEBSEARCH_CALL_BIBFORMAT, \ CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX, \ CFG_WEBSEARCH_FIELDS_CONVERT, \ CFG_WEBSEARCH_NB_RECORDS_TO_SORT, \ CFG_WEBSEARCH_SEARCH_CACHE_SIZE, \ CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS, \ CFG_WEBSEARCH_USE_ALEPH_SYSNOS, \ CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS, \ CFG_WEBSEARCH_FULLTEXT_SNIPPETS, \ CFG_WEBSEARCH_DISPLAY_NEAREST_TERMS, \ CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE, \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, \ CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS, \ CFG_WEBSEARCH_SYNONYM_KBRS, \ CFG_SITE_LANG, \ CFG_SITE_NAME, \ CFG_LOGDIR, \ CFG_BIBFORMAT_HIDDEN_TAGS, \ CFG_SITE_URL, \ CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS, \ CFG_SOLR_URL, \ CFG_WEBSEARCH_DETAILED_META_FORMAT, \ CFG_SITE_RECORD, \ CFG_WEBSEARCH_PREV_NEXT_HIT_LIMIT, \ CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY, \ CFG_BIBSORT_BUCKETS from invenio.search_engine_config import \ InvenioWebSearchUnknownCollectionError, \ InvenioWebSearchWildcardLimitError, \ CFG_WEBSEARCH_IDXPAIRS_FIELDS,\ CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH from invenio.search_engine_utils import get_fieldvalues from invenio.bibrecord import create_record from invenio.bibrank_record_sorter import get_bibrank_methods, is_method_valid, rank_records as rank_records_bibrank from invenio.bibrank_downloads_similarity import register_page_view_event, calculate_reading_similarity_list from invenio.bibindex_engine_stemmer import stem from invenio.bibindex_engine_tokenizer import author_name_requires_phrase_search, \ BibIndexPairTokenizer from invenio.bibindex_engine_washer import wash_index_term, lower_index_term, wash_author_name from invenio.bibformat import format_record, format_records, get_output_format_content_type, create_excel from invenio.bibformat_config import CFG_BIBFORMAT_USE_OLD_BIBFORMAT from invenio.bibrank_downloads_grapher import create_download_history_graph_and_box from invenio.bibknowledge import get_kbr_values from invenio.data_cacher import DataCacher 
from invenio.websearch_external_collections import print_external_results_overview, perform_external_collection_search from invenio.access_control_admin import acc_get_action_id from invenio.access_control_config import VIEWRESTRCOLL, \ CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS, \ CFG_ACC_GRANT_VIEWER_RIGHTS_TO_EMAILS_IN_TAGS from invenio.websearchadminlib import get_detailed_page_tabs, get_detailed_page_tabs_counts from invenio.intbitset import intbitset from invenio.dbquery import DatabaseError, deserialize_via_marshal, InvenioDbQueryWildcardLimitError from invenio.access_control_engine import acc_authorize_action from invenio.errorlib import register_exception from invenio.textutils import encode_for_xml, wash_for_utf8, strip_accents from invenio.htmlutils import get_mathjax_header from invenio.htmlutils import nmtoken_from_string import invenio.template webstyle_templates = invenio.template.load('webstyle') webcomment_templates = invenio.template.load('webcomment') from invenio.bibrank_citation_searcher import calculate_cited_by_list, \ calculate_co_cited_with_list, get_records_with_num_cites, get_self_cited_by, \ get_refersto_hitset, get_citedby_hitset from invenio.bibrank_citation_grapher import create_citation_history_graph_and_box from invenio.dbquery import run_sql, run_sql_with_limit, wash_table_column_name, \ get_table_update_time from invenio.webuser import getUid, collect_user_info, session_param_set from invenio.webpage import pageheaderonly, pagefooteronly, create_error_box, write_warning from invenio.messages import gettext_set_language from invenio.search_engine_query_parser import SearchQueryParenthesisedParser, \ SpiresToInvenioSyntaxConverter from invenio import webinterface_handler_config as apache from invenio.solrutils import solr_get_bitset try: import invenio.template websearch_templates = invenio.template.load('websearch') except: pass from invenio.websearch_external_collections import calculate_hosted_collections_results, do_calculate_hosted_collections_results from invenio.websearch_external_collections_config import CFG_HOSTED_COLLECTION_TIMEOUT_ANTE_SEARCH from invenio.websearch_external_collections_config import CFG_HOSTED_COLLECTION_TIMEOUT_POST_SEARCH from invenio.websearch_external_collections_config import CFG_EXTERNAL_COLLECTION_MAXRESULTS VIEWRESTRCOLL_ID = acc_get_action_id(VIEWRESTRCOLL) ## global vars: cfg_nb_browse_seen_records = 100 # limit of the number of records to check when browsing certain collection cfg_nicely_ordered_collection_list = 0 # do we propose collection list nicely ordered or alphabetical? 
## precompile some often-used regexp for speed reasons: re_word = re.compile('[\s]') re_quotes = re.compile('[\'\"]') re_doublequote = re.compile('\"') re_logical_and = re.compile('\sand\s', re.I) re_logical_or = re.compile('\sor\s', re.I) re_logical_not = re.compile('\snot\s', re.I) re_operators = re.compile(r'\s([\+\-\|])\s') re_pattern_wildcards_after_spaces = re.compile(r'(\s)[\*\%]+') re_pattern_single_quotes = re.compile("'(.*?)'") re_pattern_double_quotes = re.compile("\"(.*?)\"") re_pattern_parens_quotes = re.compile(r'[\'\"]{1}[^\'\"]*(\([^\'\"]*\))[^\'\"]*[\'\"]{1}') re_pattern_regexp_quotes = re.compile("\/(.*?)\/") re_pattern_spaces_after_colon = re.compile(r'(:\s+)') re_pattern_short_words = re.compile(r'([\s\"]\w{1,3})[\*\%]+') re_pattern_space = re.compile("__SPACE__") re_pattern_today = re.compile("\$TODAY\$") re_pattern_parens = re.compile(r'\([^\)]+\s+[^\)]+\)') class RestrictedCollectionDataCacher(DataCacher): def __init__(self): def cache_filler(): ret = [] try: res = run_sql("""SELECT DISTINCT ar.value FROM accROLE_accACTION_accARGUMENT raa JOIN accARGUMENT ar ON raa.id_accARGUMENT = ar.id WHERE ar.keyword = 'collection' AND raa.id_accACTION = %s""", (VIEWRESTRCOLL_ID,), run_on_slave=True) except Exception: # database problems, return empty cache return [] for coll in res: ret.append(coll[0]) return ret def timestamp_verifier(): return max(get_table_update_time('accROLE_accACTION_accARGUMENT'), get_table_update_time('accARGUMENT')) DataCacher.__init__(self, cache_filler, timestamp_verifier) def collection_restricted_p(collection, recreate_cache_if_needed=True): if recreate_cache_if_needed: restricted_collection_cache.recreate_cache_if_needed() return collection in restricted_collection_cache.cache try: restricted_collection_cache.is_ok_p except Exception: restricted_collection_cache = RestrictedCollectionDataCacher() def ziplist(*lists): """Just like zip(), but returns lists of lists instead of lists of tuples Example: zip([f1, f2, f3], [p1, p2, p3], [op1, op2, '']) => [(f1, p1, op1), (f2, p2, op2), (f3, p3, '')] ziplist([f1, f2, f3], [p1, p2, p3], [op1, op2, '']) => [[f1, p1, op1], [f2, p2, op2], [f3, p3, '']] FIXME: This is handy to have, and should live somewhere else, like miscutil.really_useful_functions or something. XXX: Starting in python 2.6, the same can be achieved (faster) by using itertools.izip_longest(); when the minimum recommended Python is bumped, we should use that instead. """ def l(*items): return list(items) return map(l, *lists) def get_permitted_restricted_collections(user_info, recreate_cache_if_needed=True): """Return a list of collection that are restricted but for which the user is authorized.""" if recreate_cache_if_needed: restricted_collection_cache.recreate_cache_if_needed() ret = [] for collection in restricted_collection_cache.cache: if acc_authorize_action(user_info, 'viewrestrcoll', collection=collection)[0] == 0: ret.append(collection) return ret def get_all_restricted_recids(): """ Return the set of all the restricted recids, i.e. the ids of those records which belong to at least one restricted collection. """ ret = intbitset() for collection in restricted_collection_cache.cache: ret |= get_collection_reclist(collection) return ret def get_restricted_collections_for_recid(recid, recreate_cache_if_needed=True): """ Return the list of restricted collection names to which recid belongs. 
""" if recreate_cache_if_needed: restricted_collection_cache.recreate_cache_if_needed() collection_reclist_cache.recreate_cache_if_needed() return [collection for collection in restricted_collection_cache.cache if recid in get_collection_reclist(collection, recreate_cache_if_needed=False)] def is_user_owner_of_record(user_info, recid): """ Check if the user is owner of the record, i.e. he is the submitter and/or belongs to a owner-like group authorized to 'see' the record. @param user_info: the user_info dictionary that describe the user. @type user_info: user_info dictionary @param recid: the record identifier. @type recid: positive integer @return: True if the user is 'owner' of the record; False otherwise @rtype: bool """ authorized_emails_or_group = [] for tag in CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS: authorized_emails_or_group.extend(get_fieldvalues(recid, tag)) for email_or_group in authorized_emails_or_group: if email_or_group in user_info['group']: return True email = email_or_group.strip().lower() if user_info['email'].strip().lower() == email: return True return False ###FIXME: This method needs to be refactorized def is_user_viewer_of_record(user_info, recid): """ Check if the user is allow to view the record based in the marc tags inside CFG_ACC_GRANT_VIEWER_RIGHTS_TO_EMAILS_IN_TAGS i.e. his email is inside the 506__m tag or he is inside an e-group listed in the 506__m tag @param user_info: the user_info dictionary that describe the user. @type user_info: user_info dictionary @param recid: the record identifier. @type recid: positive integer @return: True if the user is 'allow to view' the record; False otherwise @rtype: bool """ authorized_emails_or_group = [] for tag in CFG_ACC_GRANT_VIEWER_RIGHTS_TO_EMAILS_IN_TAGS: authorized_emails_or_group.extend(get_fieldvalues(recid, tag)) for email_or_group in authorized_emails_or_group: if email_or_group in user_info['group']: return True email = email_or_group.strip().lower() if user_info['email'].strip().lower() == email: return True return False def check_user_can_view_record(user_info, recid): """ Check if the user is authorized to view the given recid. The function grants access in two cases: either user has author rights on this record, or he has view rights to the primary collection this record belongs to. @param user_info: the user_info dictionary that describe the user. @type user_info: user_info dictionary @param recid: the record identifier. @type recid: positive integer @return: (0, ''), when authorization is granted, (>0, 'message') when authorization is not granted @rtype: (int, string) """ policy = CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY.strip().upper() if isinstance(recid, str): recid = int(recid) if record_public_p(recid): ## The record is already known to be public. return (0, '') ## At this point, either webcoll has not yet run or there are some ## restricted collections. Let's see first if the user own the record. if is_user_owner_of_record(user_info, recid): ## Perfect! It's authorized then! return (0, '') if is_user_viewer_of_record(user_info, recid): ## Perfect! It's authorized then! 
return (0, '') restricted_collections = get_restricted_collections_for_recid(recid, recreate_cache_if_needed=False) if restricted_collections: ## If there are restricted collections the user must be authorized to all/any of them (depending on the policy) auth_code, auth_msg = 0, '' for collection in get_restricted_collections_for_recid(recid, recreate_cache_if_needed=False): (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=collection) if auth_code and policy != 'ANY': ## Ouch! the user is not authorized to this collection return (auth_code, auth_msg) elif auth_code == 0 and policy == 'ANY': ## Good! At least one collection is authorized return (0, '') ## Depending on the policy, the user will be either authorized or not return auth_code, auth_msg if is_record_in_any_collection(recid, recreate_cache_if_needed=False): ## the record is not in any restricted collection return (0, '') elif record_exists(recid) > 0: ## We are in the case where webcoll has not run. ## Let's authorize SUPERADMIN (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=None) if auth_code == 0: return (0, '') else: ## Too bad. Let's print a nice message: return (1, """The record you are trying to access has just been submitted to the system and needs to be assigned to the proper collections. It is currently restricted for security reasons until the assignment will be fully completed. Please come back later to properly access this record.""") else: ## The record either does not exists or has been deleted. ## Let's handle these situations outside of this code. return (0, '') class IndexStemmingDataCacher(DataCacher): """ Provides cache for stemming information for word/phrase indexes. This class is not to be used directly; use function get_index_stemming_language() instead. """ def __init__(self): def cache_filler(): try: res = run_sql("""SELECT id, stemming_language FROM idxINDEX""") except DatabaseError: # database problems, return empty cache return {} return dict(res) def timestamp_verifier(): return get_table_update_time('idxINDEX') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: index_stemming_cache.is_ok_p except Exception: index_stemming_cache = IndexStemmingDataCacher() def get_index_stemming_language(index_id, recreate_cache_if_needed=True): """Return stemming langugage for given index.""" if recreate_cache_if_needed: index_stemming_cache.recreate_cache_if_needed() return index_stemming_cache.cache[index_id] class CollectionRecListDataCacher(DataCacher): """ Provides cache for collection reclist hitsets. This class is not to be used directly; use function get_collection_reclist() instead. 
""" def __init__(self): def cache_filler(): ret = {} try: res = run_sql("SELECT name FROM collection") except Exception: # database problems, return empty cache return {} for name in res: ret[name[0]] = None # this will be filled later during runtime by calling get_collection_reclist(coll) return ret def timestamp_verifier(): return get_table_update_time('collection') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not collection_reclist_cache.is_ok_p: raise Exception except Exception: collection_reclist_cache = CollectionRecListDataCacher() def get_collection_reclist(coll, recreate_cache_if_needed=True): """Return hitset of recIDs that belong to the collection 'coll'.""" if recreate_cache_if_needed: collection_reclist_cache.recreate_cache_if_needed() if coll not in collection_reclist_cache.cache: return intbitset() # collection does not exist; return empty set if not collection_reclist_cache.cache[coll]: # collection's reclist not in the cache yet, so calculate it # and fill the cache: reclist = intbitset() query = "SELECT nbrecs,reclist FROM collection WHERE name=%s" res = run_sql(query, (coll, ), 1) if res: try: reclist = intbitset(res[0][1]) except: pass collection_reclist_cache.cache[coll] = reclist # finally, return reclist: return collection_reclist_cache.cache[coll] def get_available_output_formats(visible_only=False): """ Return the list of available output formats. When visible_only is True, returns only those output formats that have visibility flag set to 1. """ formats = [] query = "SELECT code,name FROM format" if visible_only: query += " WHERE visibility='1'" query += " ORDER BY name ASC" res = run_sql(query) if res: # propose found formats: for code, name in res: formats.append({ 'value' : code, 'text' : name }) else: formats.append({'value' : 'hb', 'text' : "HTML brief" }) return formats class SearchResultsCache(DataCacher): """ Provides temporary lazy cache for Search Results. Useful when users click on `next page'. """ def __init__(self): def cache_filler(): return {} def timestamp_verifier(): return '1970-01-01 00:00:00' # lazy cache is always okay; # its filling is governed by # CFG_WEBSEARCH_SEARCH_CACHE_SIZE DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not search_results_cache.is_ok_p: raise Exception except Exception: search_results_cache = SearchResultsCache() class CollectionI18nNameDataCacher(DataCacher): """ Provides cache for I18N collection names. This class is not to be used directly; use function get_coll_i18nname() instead. """ def __init__(self): def cache_filler(): ret = {} try: res = run_sql("SELECT c.name,cn.ln,cn.value FROM collectionname AS cn, collection AS c WHERE cn.id_collection=c.id AND cn.type='ln'") # ln=long name except Exception: # database problems return {} for c, ln, i18nname in res: if i18nname: if not ret.has_key(c): ret[c] = {} ret[c][ln] = i18nname return ret def timestamp_verifier(): return get_table_update_time('collectionname') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not collection_i18nname_cache.is_ok_p: raise Exception except Exception: collection_i18nname_cache = CollectionI18nNameDataCacher() def get_coll_i18nname(c, ln=CFG_SITE_LANG, verify_cache_timestamp=True): """ Return nicely formatted collection name (of the name type `ln' (=long name)) for collection C in language LN. This function uses collection_i18nname_cache, but it verifies whether the cache is up-to-date first by default. This verification step is performed by checking the DB table update time. 
So, if you call this function 1000 times, it can get very slow because it will do 1000 table update time verifications, even though collection names change not that often. Hence the parameter VERIFY_CACHE_TIMESTAMP which, when set to False, will assume the cache is already up-to-date. This is useful namely in the generation of collection lists for the search results page. """ if verify_cache_timestamp: collection_i18nname_cache.recreate_cache_if_needed() out = c try: out = collection_i18nname_cache.cache[c][ln] except KeyError: pass # translation in LN does not exist return out class FieldI18nNameDataCacher(DataCacher): """ Provides cache for I18N field names. This class is not to be used directly; use function get_field_i18nname() instead. """ def __init__(self): def cache_filler(): ret = {} try: res = run_sql("SELECT f.name,fn.ln,fn.value FROM fieldname AS fn, field AS f WHERE fn.id_field=f.id AND fn.type='ln'") # ln=long name except Exception: # database problems, return empty cache return {} for f, ln, i18nname in res: if i18nname: if not ret.has_key(f): ret[f] = {} ret[f][ln] = i18nname return ret def timestamp_verifier(): return get_table_update_time('fieldname') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not field_i18nname_cache.is_ok_p: raise Exception except Exception: field_i18nname_cache = FieldI18nNameDataCacher() def get_field_i18nname(f, ln=CFG_SITE_LANG, verify_cache_timestamp=True): """ Return nicely formatted field name (of type 'ln', 'long name') for field F in language LN. If VERIFY_CACHE_TIMESTAMP is set to True, then verify DB timestamp and field I18N name cache timestamp and refresh cache from the DB if needed. Otherwise don't bother checking DB timestamp and return the cached value. (This is useful when get_field_i18nname is called inside a loop.) """ if verify_cache_timestamp: field_i18nname_cache.recreate_cache_if_needed() out = f try: out = field_i18nname_cache.cache[f][ln] except KeyError: pass # translation in LN does not exist return out def get_alphabetically_ordered_collection_list(level=0, ln=CFG_SITE_LANG): """Returns nicely ordered (score respected) list of collections, more exactly list of tuples (collection name, printable collection name). Suitable for create_search_box().""" out = [] res = run_sql("SELECT name FROM collection ORDER BY name ASC") for c_name in res: c_name = c_name[0] # make a nice printable name (e.g. truncate c_printable for # long collection names in given language): c_printable_fullname = get_coll_i18nname(c_name, ln, False) c_printable = wash_index_term(c_printable_fullname, 30, False) if c_printable != c_printable_fullname: c_printable = c_printable + "..." if level: c_printable = " " + level * '-' + " " + c_printable out.append([c_name, c_printable]) return out def get_nicely_ordered_collection_list(collid=1, level=0, ln=CFG_SITE_LANG): """Returns nicely ordered (score respected) list of collections, more exactly list of tuples (collection name, printable collection name). Suitable for create_search_box().""" colls_nicely_ordered = [] res = run_sql("""SELECT c.name,cc.id_son FROM collection_collection AS cc, collection AS c WHERE c.id=cc.id_son AND cc.id_dad=%s ORDER BY score DESC""", (collid, )) for c, cid in res: # make a nice printable name (e.g. truncate c_printable for # long collection names in given language): c_printable_fullname = get_coll_i18nname(c, ln, False) c_printable = wash_index_term(c_printable_fullname, 30, False) if c_printable != c_printable_fullname: c_printable = c_printable + "..." 
if level: c_printable = " " + level * '-' + " " + c_printable colls_nicely_ordered.append([c, c_printable]) colls_nicely_ordered = colls_nicely_ordered + get_nicely_ordered_collection_list(cid, level+1, ln=ln) return colls_nicely_ordered def get_index_id_from_field(field): """ Return index id with name corresponding to FIELD, or the first index id where the logical field code named FIELD is indexed. Return zero in case there is no index defined for this field. Example: field='author', output=4. """ out = 0 if not field: field = 'global' # empty string field means 'global' index (field 'anyfield') # first look in the index table: res = run_sql("""SELECT id FROM idxINDEX WHERE name=%s""", (field,)) if res: out = res[0][0] return out # not found in the index table, now look in the field table: res = run_sql("""SELECT w.id FROM idxINDEX AS w, idxINDEX_field AS wf, field AS f WHERE f.code=%s AND wf.id_field=f.id AND w.id=wf.id_idxINDEX LIMIT 1""", (field,)) if res: out = res[0][0] return out def get_words_from_pattern(pattern): "Returns list of whitespace-separated words from pattern." words = {} for word in string.split(pattern): if not words.has_key(word): words[word] = 1 return words.keys() def create_basic_search_units(req, p, f, m=None, of='hb'): """Splits search pattern and search field into a list of independently searchable units. - A search unit consists of '(operator, pattern, field, type, hitset)' tuples where 'operator' is set union (|), set intersection (+) or set exclusion (-); 'pattern' is either a word (e.g. muon*) or a phrase (e.g. 'nuclear physics'); 'field' is either a code like 'title' or MARC tag like '100__a'; 'type' is the search type ('w' for word file search, 'a' for access file search). - Optionally, the function accepts the match type argument 'm'. If it is set (e.g. from advanced search interface), then it performs this kind of matching. If it is not set, then a guess is made. 'm' can have values: 'a'='all of the words', 'o'='any of the words', 'p'='phrase/substring', 'r'='regular expression', 'e'='exact value'. - Warnings are printed on req (when not None) in case of HTML output formats.""" opfts = [] # will hold (o,p,f,t,h) units # FIXME: quick hack for the journal index if f == 'journal': opfts.append(['+', p, f, 'w']) return opfts ## check arguments: is desired matching type set? if m: ## A - matching type is known; good! if m == 'e': # A1 - exact value: opfts.append(['+', p, f, 'a']) # '+' since we have only one unit elif m == 'p': # A2 - phrase/substring: opfts.append(['+', "%" + p + "%", f, 'a']) # '+' since we have only one unit elif m == 'r': # A3 - regular expression: opfts.append(['+', p, f, 'r']) # '+' since we have only one unit elif m == 'a' or m == 'w': # A4 - all of the words: p = strip_accents(p) # strip accents for 'w' mode, FIXME: delete when not needed for word in get_words_from_pattern(p): opfts.append(['+', word, f, 'w']) # '+' in all units elif m == 'o': # A5 - any of the words: p = strip_accents(p) # strip accents for 'w' mode, FIXME: delete when not needed for word in get_words_from_pattern(p): if len(opfts)==0: opfts.append(['+', word, f, 'w']) # '+' in the first unit else: opfts.append(['|', word, f, 'w']) # '|' in further units else: if of.startswith("h"): write_warning("Matching type '%s' is not implemented yet." 
% cgi.escape(m), "Warning", req=req) opfts.append(['+', "%" + p + "%", f, 'w']) else: ## B - matching type is not known: let us try to determine it by some heuristics if f and p[0] == '"' and p[-1] == '"': ## B0 - does 'p' start and end by double quote, and is 'f' defined? => doing ACC search opfts.append(['+', p[1:-1], f, 'a']) elif f in ('author', 'firstauthor', 'exactauthor', 'exactfirstauthor') and author_name_requires_phrase_search(p): ## B1 - do we search in author, and does 'p' contain space/comma/dot/etc? ## => doing washed ACC search opfts.append(['+', p, f, 'a']) elif f and p[0] == "'" and p[-1] == "'": ## B0bis - does 'p' start and end by single quote, and is 'f' defined? => doing ACC search opfts.append(['+', '%' + p[1:-1] + '%', f, 'a']) elif f and p[0] == "/" and p[-1] == "/": ## B0ter - does 'p' start and end by a slash, and is 'f' defined? => doing regexp search opfts.append(['+', p[1:-1], f, 'r']) elif f and string.find(p, ',') >= 0: ## B1 - does 'p' contain comma, and is 'f' defined? => doing ACC search opfts.append(['+', p, f, 'a']) elif f and str(f[0:2]).isdigit(): ## B2 - does 'f' exist and starts by two digits? => doing ACC search opfts.append(['+', p, f, 'a']) else: ## B3 - doing WRD search, but maybe ACC too # search units are separated by spaces unless the space is within single or double quotes # so, let us replace temporarily any space within quotes by '__SPACE__' p = re_pattern_single_quotes.sub(lambda x: "'"+string.replace(x.group(1), ' ', '__SPACE__')+"'", p) p = re_pattern_double_quotes.sub(lambda x: "\""+string.replace(x.group(1), ' ', '__SPACE__')+"\"", p) p = re_pattern_regexp_quotes.sub(lambda x: "/"+string.replace(x.group(1), ' ', '__SPACE__')+"/", p) # and spaces after colon as well: p = re_pattern_spaces_after_colon.sub(lambda x: string.replace(x.group(1), ' ', '__SPACE__'), p) # wash argument: p = re_logical_and.sub(" ", p) p = re_logical_or.sub(" |", p) p = re_logical_not.sub(" -", p) p = re_operators.sub(r' \1', p) for pi in string.split(p): # iterate through separated units (or items, as "pi" stands for "p item") pi = re_pattern_space.sub(" ", pi) # replace back '__SPACE__' by ' ' # firstly, determine set operator if pi[0] == '+' or pi[0] == '-' or pi[0] == '|': oi = pi[0] pi = pi[1:] else: # okay, there is no operator, so let us decide what to do by default oi = '+' # by default we are doing set intersection... 
# secondly, determine search pattern and field: if string.find(pi, ":") > 0: fi, pi = string.split(pi, ":", 1) fi = wash_field(fi) # test whether fi is a real index code or a MARC-tag defined code: if fi in get_fieldcodes() or '00' <= fi[:2] <= '99': pass else: # it is not, so join it back: fi, pi = f, fi + ":" + pi else: fi, pi = f, pi # wash 'fi' argument: fi = wash_field(fi) # wash 'pi' argument: pi = pi.strip() # strip eventual spaces if re_quotes.match(pi): # B3a - quotes are found => do ACC search (phrase search) if pi[0] == '"' and pi[-1] == '"': pi = string.replace(pi, '"', '') # remove quote signs opfts.append([oi, pi, fi, 'a']) elif pi[0] == "'" and pi[-1] == "'": pi = string.replace(pi, "'", "") # remove quote signs opfts.append([oi, "%" + pi + "%", fi, 'a']) else: # unbalanced quotes, so fall back to WRD query: opfts.append([oi, pi, fi, 'w']) elif pi.startswith('/') and pi.endswith('/'): # B3b - pi has slashes around => do regexp search opfts.append([oi, pi[1:-1], fi, 'r']) elif fi and len(fi) > 1 and str(fi[0]).isdigit() and str(fi[1]).isdigit(): # B3c - fi exists and starts by two digits => do ACC search opfts.append([oi, pi, fi, 'a']) elif fi and not get_index_id_from_field(fi) and get_field_name(fi): # B3d - logical field fi exists but there is no WRD index for fi => try ACC search opfts.append([oi, pi, fi, 'a']) else: # B3e - general case => do WRD search pi = strip_accents(pi) # strip accents for 'w' mode, FIXME: delete when not needed for pii in get_words_from_pattern(pi): opfts.append([oi, pii, fi, 'w']) ## sanity check: for i in range(0, len(opfts)): try: pi = opfts[i][1] if pi == '*': if of.startswith("h"): write_warning("Ignoring standalone wildcard word.", "Warning", req=req) del opfts[i] if pi == '' or pi == ' ': fi = opfts[i][2] if fi: if of.startswith("h"): write_warning("Ignoring empty %s search term." % fi, "Warning", req=req) del opfts[i] except: pass ## replace old logical field names if applicable: if CFG_WEBSEARCH_FIELDS_CONVERT: opfts = [[o, p, wash_field(f), t] for o, p, f, t in opfts] ## return search units: return opfts def page_start(req, of, cc, aas, ln, uid, title_message=None, description='', keywords='', recID=-1, tab='', p=''): """ Start page according to given output format. @param title_message: title of the page, not escaped for HTML @param description: description of the page, not escaped for HTML @param keywords: keywords of the page, not escaped for HTML """ _ = gettext_set_language(ln) if not req or isinstance(req, cStringIO.OutputType): return # we were called from CLI if not title_message: title_message = _("Search Results") content_type = get_output_format_content_type(of) if of.startswith('x'): if of == 'xr': # we are doing RSS output req.content_type = "application/rss+xml" req.send_http_header() req.write("""\n""") else: # we are doing XML output: req.content_type = "text/xml" req.send_http_header() req.write("""\n""") elif of.startswith('t') or str(of[0:3]).isdigit(): # we are doing plain text output: req.content_type = "text/plain" req.send_http_header() elif of == "id": pass # nothing to do, we shall only return list of recIDs elif content_type == 'text/html': # we are doing HTML output: req.content_type = "text/html" req.send_http_header() if not description: description = "%s %s." 
% (cc, _("Search Results")) if not keywords: keywords = "%s, WebSearch, %s" % (get_coll_i18nname(CFG_SITE_NAME, ln, False), get_coll_i18nname(cc, ln, False)) ## generate RSS URL: argd = {} if req.args: argd = cgi.parse_qs(req.args) rssurl = websearch_templates.build_rss_url(argd) ## add MathJax if displaying single records (FIXME: find ## eventual better place to this code) if of.lower() in CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS: metaheaderadd = get_mathjax_header(req.is_https()) else: metaheaderadd = '' # Add metadata in meta tags for Google scholar-esque harvesting... # only if we have a detailed meta format and we are looking at a # single record if (recID != -1 and CFG_WEBSEARCH_DETAILED_META_FORMAT): metaheaderadd += format_record(recID, \ CFG_WEBSEARCH_DETAILED_META_FORMAT, \ ln = ln) ## generate navtrail: navtrail = create_navtrail_links(cc, aas, ln) if navtrail != '': navtrail += ' > ' if (tab != '' or ((of != '' or of.lower() != 'hd') and of != 'hb')) and \ recID != -1: # If we are not in information tab in HD format, customize # the nav. trail to have a link back to main record. (Due # to the way perform_request_search() works, hb # (lowercase) is equal to hd) navtrail += ' %s' % \ (CFG_SITE_URL, CFG_SITE_RECORD, recID, cgi.escape(title_message)) if (of != '' or of.lower() != 'hd') and of != 'hb': # Export format_name = of query = "SELECT name FROM format WHERE code=%s" res = run_sql(query, (of,)) if res: format_name = res[0][0] navtrail += ' > ' + format_name else: # Discussion, citations, etc. tabs tab_label = get_detailed_page_tabs(cc, ln=ln)[tab]['label'] navtrail += ' > ' + _(tab_label) else: navtrail += cgi.escape(title_message) if p: # we are serving search/browse results pages, so insert pattern: navtrail += ": " + cgi.escape(p) title_message = p + " - " + title_message body_css_classes = [] if cc: # we know the collection, lets allow page styles based on cc #collection names may not satisfy rules for css classes which #are something like: -?[_a-zA-Z]+[_a-zA-Z0-9-]* #however it isn't clear what we should do about cases with #numbers, so we leave them to fail. Everything else becomes "_" css = nmtoken_from_string(cc).replace('.','_').replace('-','_').replace(':','_') body_css_classes.append(css) ## finally, print page header: req.write(pageheaderonly(req=req, title=title_message, navtrail=navtrail, description=description, keywords=keywords, metaheaderadd=metaheaderadd, uid=uid, language=ln, navmenuid='search', navtrail_append_title_p=0, rssurl=rssurl, body_css_classes=body_css_classes)) req.write(websearch_templates.tmpl_search_pagestart(ln=ln)) #else: # req.send_http_header() def page_end(req, of="hb", ln=CFG_SITE_LANG): "End page according to given output format: e.g. close XML tags, add HTML footer, etc." if of == "id": return [] # empty recID list if not req: return # we were called from CLI if of.startswith('h'): req.write(websearch_templates.tmpl_search_pageend(ln = ln)) # pagebody end req.write(pagefooteronly(lastupdated=__lastupdated__, language=ln, req=req)) return def create_page_title_search_pattern_info(p, p1, p2, p3): """Create the search pattern bit for the page
            #print_warning(req, """<p>No match close to <em>%s</em> found in given collections.
            #Please try different term.<p>Displaying matches in any collection...""" % p_orig)
## try to get nbhits for these phrases in any collection:
for phrase in browsed_phrases:
browsed_phrases_in_colls.append([phrase, get_nbhits_in_bibxxx(phrase, f)])
## display results now:
out = websearch_templates.tmpl_browse_pattern(
f=f,
fn=get_field_i18nname(get_field_name(f) or f, ln, False),
ln=ln,
browsed_phrases_in_colls=browsed_phrases_in_colls,
colls=colls,
rg=rg,
)
req.write(out)
return
def browse_in_bibwords(req, p, f, ln=CFG_SITE_LANG):
"""Browse inside words indexes."""
if not p:
return
_ = gettext_set_language(ln)
urlargd = {}
urlargd.update(req.argd)
urlargd['action'] = 'search'
nearest_box = create_nearest_terms_box(urlargd, p, f, 'w', ln=ln, intro_text_p=0)
req.write(websearch_templates.tmpl_search_in_bibwords(
p = p,
f = f,
ln = ln,
nearest_box = nearest_box
))
return
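# Usage sketch for browse_in_bibwords() (request values hypothetical,
# not from this repository): browsing p='quntum' in f='title' renders a
# "nearest terms" box built from the word index, letting the user jump
# to correctly spelled neighbours such as 'quantum'; the urlargd copy
# keeps the rest of the query intact while switching the action to
# 'search'.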
def search_pattern(req=None, p=None, f=None, m=None, ap=0, of="id", verbose=0, ln=CFG_SITE_LANG, display_nearest_terms_box=True, wl=0):
"""Search for complex pattern 'p' within field 'f' according to
matching type 'm'. Return hitset of recIDs.
The function uses multi-stage searching algorithm in case of no
exact match found. See the Search Internals document for
detailed description.
The 'ap' argument governs whether an alternative patterns are to
be used in case there is no direct hit for (p,f,m). For
example, whether to replace non-alphanumeric characters by
spaces if it would give some hits. See the Search Internals
document for detailed description. (ap=0 forbits the
alternative pattern usage, ap=1 permits it.)
'ap' is also internally used for allowing hidden tag search
(for requests coming from webcoll, for example). In this
- case ap=9
+ case ap=-9
       The 'of' argument governs whether or not to print some
       information to the user in case of no match found.  (Usually it
       prints the information for HTML formats, and is otherwise
       silent.)
The 'verbose' argument controls the level of debugging information
to be printed (0=least, 9=most).
All the parameters are assumed to have been previously washed.
This function is suitable as a mid-level API.
"""
_ = gettext_set_language(ln)
hitset_empty = intbitset()
# sanity check:
if not p:
hitset_full = intbitset(trailing_bits=1)
hitset_full.discard(0)
        # no pattern, so return the whole universe
return hitset_full
# search stage 1: break up arguments into basic search units:
if verbose and of.startswith("h"):
t1 = os.times()[4]
basic_search_units = create_basic_search_units(req, p, f, m, of)
if verbose and of.startswith("h"):
t2 = os.times()[4]
write_warning("Search stage 1: basic search units are: %s" % cgi.escape(repr(basic_search_units)), req=req)
write_warning("Search stage 1: execution took %.2f seconds." % (t2 - t1), req=req)
# search stage 2: do search for each search unit and verify hit presence:
if verbose and of.startswith("h"):
t1 = os.times()[4]
basic_search_units_hitsets = []
#prepare hiddenfield-related..
myhiddens = CFG_BIBFORMAT_HIDDEN_TAGS
can_see_hidden = False
if req:
user_info = collect_user_info(req)
can_see_hidden = user_info.get('precached_canseehiddenmarctags', False)
- if not req and ap == 9: # special request, coming from webcoll
+ if not req and ap == -9: # special request, coming from webcoll
can_see_hidden = True
if can_see_hidden:
myhiddens = []
if CFG_INSPIRE_SITE and of.startswith('h'):
# fulltext/caption search warnings for INSPIRE:
fields_to_be_searched = [f for o, p, f, m in basic_search_units]
if 'fulltext' in fields_to_be_searched:
write_warning( _("Warning: full-text search is only available for a subset of papers mostly from %(x_range_from_year)s-%(x_range_to_year)s.") % \
{'x_range_from_year': '2006',
'x_range_to_year': '2012'}, req=req)
elif 'caption' in fields_to_be_searched:
write_warning(_("Warning: figure caption search is only available for a subset of papers mostly from %(x_range_from_year)s-%(x_range_to_year)s.") % \
{'x_range_from_year': '2008',
'x_range_to_year': '2012'}, req=req)
for idx_unit in xrange(len(basic_search_units)):
bsu_o, bsu_p, bsu_f, bsu_m = basic_search_units[idx_unit]
if bsu_f and len(bsu_f) < 2:
if of.startswith("h"):
write_warning(_("There is no index %s. Searching for %s in all fields." % (bsu_f, bsu_p)), req=req)
bsu_f = ''
bsu_m = 'w'
if of.startswith("h") and verbose:
            write_warning(_('Instead searching %s.') % str([bsu_o, bsu_p, bsu_f, bsu_m]), req=req)
try:
basic_search_unit_hitset = search_unit(bsu_p, bsu_f, bsu_m, wl)
except InvenioWebSearchWildcardLimitError, excp:
basic_search_unit_hitset = excp.res
if of.startswith("h"):
write_warning(_("Search term too generic, displaying only partial results..."), req=req)
# FIXME: print warning if we use native full-text indexing
if bsu_f == 'fulltext' and bsu_m != 'w' and of.startswith('h') and not CFG_SOLR_URL:
write_warning(_("No phrase index available for fulltext yet, looking for word combination..."), req=req)
#check that the user is allowed to search with this tag
#if he/she tries it
if bsu_f and len(bsu_f) > 1 and bsu_f[0].isdigit() and bsu_f[1].isdigit():
for htag in myhiddens:
ltag = len(htag)
samelenfield = bsu_f[0:ltag]
if samelenfield == htag: #user searches by a hidden tag
#we won't show you anything..
basic_search_unit_hitset = intbitset()
if verbose >= 9 and of.startswith("h"):
write_warning("Pattern %s hitlist omitted since \
it queries in a hidden tag %s" %
(cgi.escape(repr(bsu_p)), repr(myhiddens)), req=req)
display_nearest_terms_box = False #..and stop spying, too.
if verbose >= 9 and of.startswith("h"):
write_warning("Search stage 1: pattern %s gave hitlist %s" % (cgi.escape(bsu_p), basic_search_unit_hitset), req=req)
if len(basic_search_unit_hitset) > 0 or \
- ap==0 or \
+ ap<1 or \
bsu_o=="|" or \
           ((idx_unit+1)<len(basic_search_units) and basic_search_units[idx_unit+1][0]=="|"):
    out += """<br />- <a href="%s/search/cache?action=clear">clear search results cache</a>""" % CFG_SITE_URL
        req.write("<p><big><strong>Date: %d</strong></big></p>" % yyyymmdd)
        req.write("""
    if no_records_found:
        return "<br />No results found."
    if search_timed_out:
        return "<br />The search engine did not respond in time."
return websearch_templates.tmpl_print_hosted_results(
url_and_engine=url_and_engine,
ln=ln,
of=of,
req=req,
limit=limit
)
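# Behaviour note for print_hosted_results(): an empty answer from the
# hosted engine returns the literal "No results found." message, a
# timeout returns "The search engine did not respond in time.", and
# otherwise the websearch template renders the (url, engine) pair,
# capped at 'limit' results (CFG_EXTERNAL_COLLECTION_MAXRESULTS by
# default).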
class BibSortDataCacher(DataCacher):
"""
    Cache holding the structures created by BibSort for a given method
    ('data_dict_ordered' and 'bucket_data').
"""
def __init__(self, method_name):
self.method_name = method_name
        self.method_id = 0
        res = None
        try:
            res = run_sql("""SELECT id FROM bsrMETHOD WHERE name = %s""", (self.method_name,))
        except Exception:
            res = None
        if res and res[0]:
            self.method_id = res[0][0]
def cache_filler():
method_id = self.method_id
alldicts = {}
if self.method_id == 0:
return {}
try:
res_data = run_sql("""SELECT data_dict_ordered from bsrMETHODDATA \
where id_bsrMETHOD = %s""", (method_id,))
res_buckets = run_sql("""SELECT bucket_no, bucket_data from bsrMETHODDATABUCKET\
where id_bsrMETHOD = %s""", (method_id,))
except Exception:
# database problems, return empty cache
return {}
try:
data_dict_ordered = deserialize_via_marshal(res_data[0][0])
            except Exception:
data_dict_ordered = {}
alldicts['data_dict_ordered'] = data_dict_ordered # recid: weight
if not res_buckets:
alldicts['bucket_data'] = {}
return alldicts
for row in res_buckets:
bucket_no = row[0]
try:
bucket_data = intbitset(row[1])
                except Exception:
bucket_data = intbitset([])
alldicts.setdefault('bucket_data', {})[bucket_no] = bucket_data
return alldicts
def timestamp_verifier():
method_id = self.method_id
res = run_sql("""SELECT last_updated from bsrMETHODDATA where id_bsrMETHOD = %s""", (method_id,))
try:
update_time_methoddata = str(res[0][0])
except IndexError:
update_time_methoddata = '1970-01-01 00:00:00'
res = run_sql("""SELECT max(last_updated) from bsrMETHODDATABUCKET where id_bsrMETHOD = %s""", (method_id,))
try:
update_time_buckets = str(res[0][0])
except IndexError:
update_time_buckets = '1970-01-01 00:00:00'
return max(update_time_methoddata, update_time_buckets)
DataCacher.__init__(self, cache_filler, timestamp_verifier)
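# Minimal usage sketch for the cacher above ('latest first' is a method
# name used elsewhere in this module; the cache keys follow cache_filler):
#   cacher = BibSortDataCacher('latest first')
#   cacher.recreate_cache_if_needed()
#   weights = cacher.cache.get('data_dict_ordered', {})  # recid -> weight
#   buckets = cacher.cache.get('bucket_data', {})  # bucket_no -> intbitset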
def get_sorting_methods():
if not CFG_BIBSORT_BUCKETS: # we do not want to use buckets
return {}
try: # make sure the method has some data
res = run_sql("""SELECT m.name, m.definition FROM bsrMETHOD m, bsrMETHODDATA md WHERE m.id = md.id_bsrMETHOD""")
    except Exception:
return {}
return dict(res)
sorting_methods = get_sorting_methods()
cache_sorted_data = {}
for sorting_method in sorting_methods:
try:
cache_sorted_data[sorting_method].is_ok_p
except Exception:
cache_sorted_data[sorting_method] = BibSortDataCacher(sorting_method)
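# Note on the shape of 'sorting_methods' (derived from its consumers
# below, example values hypothetical): it maps a BibSort method name to
# a definition string such as 'FIELD:title', 'MARC:773__c' or
# 'RNK:citation', and cache_sorted_data keeps one lazily refreshed
# BibSortDataCacher per method.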
def get_tags_form_sort_fields(sort_fields):
"""Given a list of sort_fields, return the tags associated with it and
also the name of the field that has no tags associated, to be able to
display a message to the user."""
tags = []
if not sort_fields:
return [], ''
for sort_field in sort_fields:
if sort_field and str(sort_field[0:2]).isdigit():
            # sort_field starts with two digits, so this is probably a MARC tag already
tags.append(sort_field)
else:
# let us check the 'field' table
field_tags = get_field_tags(sort_field)
if field_tags:
tags.extend(field_tags)
else:
return [], sort_field
return tags, ''
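# Illustrative calls (the author tag mapping is an assumed 'field'
# table configuration, not data from this repository):
#   get_tags_form_sort_fields(['author', '773__c'])
#   # -> (['100__a', '700__a', '773__c'], '')
#   get_tags_form_sort_fields(['nosuchfield'])
#   # -> ([], 'nosuchfield')  # caller warns about the invalid option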
def rank_records(req, rank_method_code, rank_limit_relevance, hitset_global, pattern=None, verbose=0, sort_order='d', of='hb', ln=CFG_SITE_LANG, rg=None, jrec=None):
"""Initial entry point for ranking records, acts like a dispatcher.
    (i)  rank_method_code is in bsrMETHOD: BibSort buckets can be used;
    (ii) rank_method_code is not in bsrMETHOD: fall back to BibRank.
"""
if CFG_BIBSORT_BUCKETS and sorting_methods:
for sort_method in sorting_methods:
definition = sorting_methods[sort_method]
if definition.startswith('RNK') and \
definition.replace('RNK:','').strip().lower() == string.lower(rank_method_code):
(solution_recs, solution_scores) = sort_records_bibsort(req, hitset_global, sort_method, '', sort_order, verbose, of, ln, rg, jrec, 'r')
#return (solution_recs, solution_scores, '', '', '')
comment = ''
if verbose > 0:
comment = 'find_citations retlist %s' % [[solution_recs[i], solution_scores[i]] for i in range(len(solution_recs))]
return (solution_recs, solution_scores, '(', ')', comment)
return rank_records_bibrank(rank_method_code, rank_limit_relevance, hitset_global, pattern, verbose)
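# Dispatch sketch (method name hypothetical): a bsrMETHOD definition of
# 'RNK:citation' routes rank_records(req, 'citation', ...) through the
# BibSort bucket path above, returning (recids, scores, '(', ')',
# comment); any method without an RNK definition falls through to
# rank_records_bibrank(), the classic BibRank scorer.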
def sort_records(req, recIDs, sort_field='', sort_order='d', sort_pattern='', verbose=0, of='hb', ln=CFG_SITE_LANG, rg=None, jrec=None):
"""Initial entry point for sorting records, acts like a dispatcher.
    (i)  sort_field is in bsrMETHOD, and thus BibSort has already sorted the data for this field, so we can use the cache;
    (ii) sort_field is not in bsrMETHOD, and thus the cache does not contain any information regarding this sorting method."""
_ = gettext_set_language(ln)
#we should return sorted records up to irec_max(exclusive)
dummy, irec_max = get_interval_for_records_to_sort(len(recIDs), jrec, rg)
    #calculate the min index on the reversed list
index_min = max(len(recIDs) - irec_max, 0) #just to be sure that the min index is not negative
#bibsort does not handle sort_pattern for now, use bibxxx
if sort_pattern:
return sort_records_bibxxx(req, recIDs, None, sort_field, sort_order, sort_pattern, verbose, of, ln, rg, jrec)
use_sorting_buckets = True
    if not CFG_BIBSORT_BUCKETS or not sorting_methods: #ignore the use of buckets, use old-fashioned sorting
use_sorting_buckets = False
if not sort_field:
if use_sorting_buckets:
return sort_records_bibsort(req, recIDs, 'latest first', sort_field, sort_order, verbose, of, ln, rg, jrec)
else:
return recIDs[index_min:]
sort_fields = string.split(sort_field, ",")
if len(sort_fields) == 1:
# we have only one sorting_field, check if it is treated by BibSort
for sort_method in sorting_methods:
definition = sorting_methods[sort_method]
if use_sorting_buckets and \
((definition.startswith('FIELD') and \
definition.replace('FIELD:','').strip().lower() == string.lower(sort_fields[0])) or \
sort_method == sort_fields[0]):
#use BibSort
return sort_records_bibsort(req, recIDs, sort_method, sort_field, sort_order, verbose, of, ln, rg, jrec)
#deduce sorting MARC tag out of the 'sort_field' argument:
tags, error_field = get_tags_form_sort_fields(sort_fields)
if error_field:
if use_sorting_buckets:
return sort_records_bibsort(req, recIDs, 'latest first', sort_field, sort_order, verbose, of, ln, rg, jrec)
else:
if of.startswith('h'):
write_warning(_("Sorry, %s does not seem to be a valid sort option. The records will not be sorted.") % cgi.escape(error_field), "Error", req=req)
return recIDs[index_min:]
if tags:
for sort_method in sorting_methods:
definition = sorting_methods[sort_method]
if definition.startswith('MARC') \
and definition.replace('MARC:','').strip().split(',') == tags \
and use_sorting_buckets:
                #this list of tags has a designated method in BibSort, so use it
return sort_records_bibsort(req, recIDs, sort_method, sort_field, sort_order, verbose, of, ln, rg, jrec)
        #we do not have this sort_field in BibSort tables -> do the old-fashioned sorting
return sort_records_bibxxx(req, recIDs, tags, sort_field, sort_order, sort_pattern, verbose, of, ln, rg, jrec)
return recIDs[index_min:]
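# Dispatcher sketch (field names hypothetical): with buckets enabled,
# sort_records(req, hits, 'title') takes the BibSort cache path when a
# 'FIELD:title' definition exists; sort_records(req, hits, 'author,year')
# first resolves MARC tags and only falls back to sort_records_bibxxx()
# when no cached method covers exactly those tags.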
def sort_records_bibsort(req, recIDs, sort_method, sort_field='', sort_order='d', verbose=0, of='hb', ln=CFG_SITE_LANG, rg=None, jrec=None, sort_or_rank = 's'):
"""This function orders the recIDs list, based on a sorting method(sort_field) using the BibSortDataCacher for speed"""
_ = gettext_set_language(ln)
#sanity check
if sort_method not in sorting_methods:
if sort_or_rank == 'r':
return rank_records_bibrank(sort_method, 0, recIDs, None, verbose)
else:
return sort_records_bibxxx(req, recIDs, None, sort_field, sort_order, '', verbose, of, ln, rg, jrec)
if verbose >= 3 and of.startswith('h'):
write_warning("Sorting (using BibSort cache) by method %s (definition %s)." \
% (cgi.escape(repr(sort_method)), cgi.escape(repr(sorting_methods[sort_method]))), req=req)
#we should return sorted records up to irec_max(exclusive)
dummy, irec_max = get_interval_for_records_to_sort(len(recIDs), jrec, rg)
solution = intbitset([])
input_recids = intbitset(recIDs)
cache_sorted_data[sort_method].recreate_cache_if_needed()
sort_cache = cache_sorted_data[sort_method].cache
bucket_numbers = sort_cache['bucket_data'].keys()
#check if all buckets have been constructed
if len(bucket_numbers) != CFG_BIBSORT_BUCKETS:
if verbose > 3 and of.startswith('h'):
write_warning("Not all buckets have been constructed.. switching to old fashion sorting.", req=req)
if sort_or_rank == 'r':
return rank_records_bibrank(sort_method, 0, recIDs, None, verbose)
else:
return sort_records_bibxxx(req, recIDs, None, sort_field, sort_order, '', verbose, of, ln, rg, jrec)
if sort_order == 'd':
bucket_numbers.reverse()
for bucket_no in bucket_numbers:
solution.union_update(input_recids & sort_cache['bucket_data'][bucket_no])
if len(solution) >= irec_max:
break
dict_solution = {}
missing_records = []
for recid in solution:
try:
dict_solution[recid] = sort_cache['data_dict_ordered'][recid]
except KeyError:
#recid is in buckets, but not in the bsrMETHODDATA,
#maybe because the value has been deleted, but the change has not yet been propagated to the buckets
missing_records.append(recid)
#check if there are recids that are not in any bucket -> to be added at the end/top, ordered by insertion date
if len(solution) < irec_max:
#some records have not been yet inserted in the bibsort structures
#or, some records have no value for the sort_method
missing_records = sorted(missing_records + list(input_recids.difference(solution)))
#the records need to be sorted in reverse order for the print record function
    #the return statement should be equivalent to the following statements
    #(these are clearer, but less efficient, since they reverse the same list twice)
#sorted_solution = (missing_records + sorted(dict_solution, key=dict_solution.__getitem__, reverse=sort_order=='d'))[:irec_max]
#sorted_solution.reverse()
#return sorted_solution
if sort_method.strip().lower().startswith('latest') and sort_order == 'd':
        # if we want to sort the records on their insertion date, add the missing records at the top
solution = sorted(dict_solution, key=dict_solution.__getitem__, reverse=sort_order=='a') + missing_records
else:
solution = missing_records + sorted(dict_solution, key=dict_solution.__getitem__, reverse=sort_order=='a')
    #calculate the min index on the reversed list
    index_min = max(len(solution) - irec_max, 0) #just to be sure that the min index is not negative
    #return all the records up to irec_max, but on the reversed list
if sort_or_rank == 'r':
# we need the recids, with values
return (solution[index_min:], [dict_solution.get(record, 0) for record in solution[index_min:]])
else:
return solution[index_min:]
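# Walk-through of the bucket logic above (bucket layout invented for
# illustration): with sort_order='d' the highest-numbered bucket is
# visited first and hits are accumulated until irec_max is reached;
# the per-recid weights from 'data_dict_ordered' then settle the order,
# while recids present in a bucket but absent from bsrMETHODDATA become
# 'missing_records' and are appended at the tail for 'latest first'
# descending sorts (print_records() walks the list from the tail, so
# they show up first), otherwise at the head.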
def sort_records_bibxxx(req, recIDs, tags, sort_field='', sort_order='d', sort_pattern='', verbose=0, of='hb', ln=CFG_SITE_LANG, rg=None, jrec=None):
"""OLD FASHION SORTING WITH NO CACHE, for sort fields that are not run in BibSort
Sort records in 'recIDs' list according sort field 'sort_field' in order 'sort_order'.
If more than one instance of 'sort_field' is found for a given record, try to choose that that is given by
'sort pattern', for example "sort by report number that starts by CERN-PS".
Note that 'sort_field' can be field code like 'author' or MARC tag like '100__a' directly."""
_ = gettext_set_language(ln)
#we should return sorted records up to irec_max(exclusive)
dummy, irec_max = get_interval_for_records_to_sort(len(recIDs), jrec, rg)
    #calculate the min index on the reversed list
index_min = max(len(recIDs) - irec_max, 0) #just to be sure that the min index is not negative
## check arguments:
if not sort_field:
return recIDs[index_min:]
if len(recIDs) > CFG_WEBSEARCH_NB_RECORDS_TO_SORT:
if of.startswith('h'):
write_warning(_("Sorry, sorting is allowed on sets of up to %d records only. Using default sort order.") % CFG_WEBSEARCH_NB_RECORDS_TO_SORT, "Warning", req=req)
return recIDs[index_min:]
recIDs_dict = {}
recIDs_out = []
if not tags:
        # tags have not been computed yet
sort_fields = string.split(sort_field, ",")
tags, error_field = get_tags_form_sort_fields(sort_fields)
if error_field:
if of.startswith('h'):
write_warning(_("Sorry, %s does not seem to be a valid sort option. The records will not be sorted.") % cgi.escape(error_field), "Error", req=req)
return recIDs[index_min:]
if verbose >= 3 and of.startswith('h'):
write_warning("Sorting by tags %s." % cgi.escape(repr(tags)), req=req)
if sort_pattern:
write_warning("Sorting preferentially by %s." % cgi.escape(sort_pattern), req=req)
## check if we have sorting tag defined:
if tags:
# fetch the necessary field values:
for recID in recIDs:
val = "" # will hold value for recID according to which sort
vals = [] # will hold all values found in sorting tag for recID
for tag in tags:
if CFG_CERN_SITE and tag == '773__c':
# CERN hack: journal sorting
# 773__c contains page numbers, e.g. 3-13, and we want to sort by 3, and numerically:
vals.extend(["%050s" % x.split("-", 1)[0] for x in get_fieldvalues(recID, tag)])
else:
vals.extend(get_fieldvalues(recID, tag))
if sort_pattern:
# try to pick that tag value that corresponds to sort pattern
bingo = 0
for v in vals:
if v.lower().startswith(sort_pattern.lower()): # bingo!
bingo = 1
val = v
break
if not bingo: # sort_pattern not present, so add other vals after spaces
val = sort_pattern + " " + string.join(vals)
else:
# no sort pattern defined, so join them all together
val = string.join(vals)
val = strip_accents(val.lower()) # sort values regardless of accents and case
if recIDs_dict.has_key(val):
recIDs_dict[val].append(recID)
else:
recIDs_dict[val] = [recID]
# sort them:
recIDs_dict_keys = recIDs_dict.keys()
recIDs_dict_keys.sort()
# now that keys are sorted, create output array:
for k in recIDs_dict_keys:
for s in recIDs_dict[k]:
recIDs_out.append(s)
# ascending or descending?
if sort_order == 'a':
recIDs_out.reverse()
# okay, we are done
# return only up to the maximum that we need to sort
if len(recIDs_out) != len(recIDs):
dummy, irec_max = get_interval_for_records_to_sort(len(recIDs_out), jrec, rg)
index_min = max(len(recIDs_out) - irec_max, 0) #just to be sure that the min index is not negative
return recIDs_out[index_min:]
else:
# good, no sort needed
return recIDs[index_min:]
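# Behaviour sketch (author values invented for illustration): sorting
# recIDs=[5, 9] where record 5 holds '100__a'='Maldacena' and record 9
# holds 'Ellis' builds recIDs_dict = {'ellis': [9], 'maldacena': [5]}
# (case and accents washed) and emits [9, 5]; since print_records()
# walks the list tail-to-head, reversing it for sort_order='a' is what
# makes Ellis come out before Maldacena on the results page.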
def get_interval_for_records_to_sort(nb_found, jrec=None, rg=None):
"""calculates in which interval should the sorted records be
a value of 'rg=-9999' means to print all records: to be used with care."""
if not jrec:
jrec = 1
if not rg:
#return all
return jrec-1, nb_found
if rg == -9999: # print all records
rg = nb_found
else:
rg = abs(rg)
if jrec < 1: # sanity checks
jrec = 1
if jrec > nb_found:
jrec = max(nb_found-rg+1, 1)
# will sort records from irec_min to irec_max excluded
irec_min = jrec - 1
irec_max = irec_min + rg
if irec_min < 0:
irec_min = 0
if irec_max > nb_found:
irec_max = nb_found
return irec_min, irec_max
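# Worked example of the arithmetic above: nb_found=100, jrec=11, rg=10
# gives (10, 20), i.e. hits 11..20 (1-based) as the half-open slice
# [10:20]; an out-of-range jrec=999 snaps back to jrec=91, so the last
# full page of 10 records is still returned.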
def print_records(req, recIDs, jrec=1, rg=CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS, format='hb', ot='', ln=CFG_SITE_LANG, relevances=[], relevances_prologue="(", relevances_epilogue="%%)", decompress=zlib.decompress, search_pattern='', print_records_prologue_p=True, print_records_epilogue_p=True, verbose=0, tab='', sf='', so='d', sp='', rm=''):
"""
Prints list of records 'recIDs' formatted according to 'format' in
groups of 'rg' starting from 'jrec'.
Assumes that the input list 'recIDs' is sorted in reverse order,
so it counts records from tail to head.
A value of 'rg=-9999' means to print all records: to be used with care.
Print also list of RELEVANCES for each record (if defined), in
between RELEVANCE_PROLOGUE and RELEVANCE_EPILOGUE.
Print prologue and/or epilogue specific to 'format' if
    'print_records_prologue_p' and/or 'print_records_epilogue_p' are
True.
'sf' is sort field and 'rm' is ranking method that are passed here
only for proper linking purposes: e.g. when a certain ranking
method or a certain sort field was selected, keep it selected in
any dynamic search links that may be printed.
"""
# load the right message language
_ = gettext_set_language(ln)
# sanity checking:
if req is None:
return
# get user_info (for formatting based on user)
if isinstance(req, cStringIO.OutputType):
user_info = {}
else:
user_info = collect_user_info(req)
if len(recIDs):
nb_found = len(recIDs)
if rg == -9999: # print all records
rg = nb_found
else:
rg = abs(rg)
if jrec < 1: # sanity checks
jrec = 1
if jrec > nb_found:
jrec = max(nb_found-rg+1, 1)
# will print records from irec_max to irec_min excluded:
irec_max = nb_found - jrec
irec_min = nb_found - jrec - rg
if irec_min < 0:
irec_min = -1
if irec_max >= nb_found:
irec_max = nb_found - 1
#req.write("%s:%d-%d" % (recIDs, irec_min, irec_max))
if format.startswith('x'):
# print header if needed
if print_records_prologue_p:
print_records_prologue(req, format)
# print records
recIDs_to_print = [recIDs[x] for x in range(irec_max, irec_min, -1)]
format_records(recIDs_to_print,
format,
ln=ln,
search_pattern=search_pattern,
record_separator="\n",
user_info=user_info,
req=req)
# print footer if needed
if print_records_epilogue_p:
print_records_epilogue(req, format)
elif format.startswith('t') or str(format[0:3]).isdigit():
# we are doing plain text output:
for irec in range(irec_max, irec_min, -1):
x = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm)
req.write(x)
if x:
req.write('\n')
elif format == 'excel':
recIDs_to_print = [recIDs[x] for x in range(irec_max, irec_min, -1)]
create_excel(recIDs=recIDs_to_print, req=req, ln=ln, ot=ot)
else:
# we are doing HTML output:
if format == 'hp' or format.startswith("hb_") or format.startswith("hd_"):
# portfolio and on-the-fly formats:
for irec in range(irec_max, irec_min, -1):
req.write(print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm))
elif format.startswith("hb"):
# HTML brief format:
display_add_to_basket = True
if user_info:
if user_info['email'] == 'guest':
if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS > 4:
display_add_to_basket = False
else:
if not user_info['precached_usebaskets']:
display_add_to_basket = False
req.write(websearch_templates.tmpl_record_format_htmlbrief_header(
ln = ln))
for irec in range(irec_max, irec_min, -1):
row_number = jrec+irec_max-irec
recid = recIDs[irec]
if relevances and relevances[irec]:
relevance = relevances[irec]
else:
relevance = ''
record = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm)
req.write(websearch_templates.tmpl_record_format_htmlbrief_body(
ln = ln,
recid = recid,
row_number = row_number,
relevance = relevance,
record = record,
relevances_prologue = relevances_prologue,
relevances_epilogue = relevances_epilogue,
display_add_to_basket = display_add_to_basket
))
req.write(websearch_templates.tmpl_record_format_htmlbrief_footer(
ln = ln,
display_add_to_basket = display_add_to_basket))
elif format.startswith("hd"):
# HTML detailed format:
for irec in range(irec_max, irec_min, -1):
if record_exists(recIDs[irec]) == -1:
write_warning(_("The record has been deleted."), req=req)
merged_recid = get_merged_recid(recIDs[irec])
if merged_recid:
write_warning(_("The record %d replaces it." % merged_recid), req=req)
continue
unordered_tabs = get_detailed_page_tabs(get_colID(guess_primary_collection_of_a_record(recIDs[irec])),
recIDs[irec], ln=ln)
ordered_tabs_id = [(tab_id, values['order']) for (tab_id, values) in unordered_tabs.iteritems()]
ordered_tabs_id.sort(lambda x, y: cmp(x[1], y[1]))
link_ln = ''
if ln != CFG_SITE_LANG:
link_ln = '?ln=%s' % ln
recid = recIDs[irec]
recid_to_display = recid # Record ID used to build the URL.
if CFG_WEBSEARCH_USE_ALEPH_SYSNOS:
try:
recid_to_display = get_fieldvalues(recid,
CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG)[0]
except IndexError:
# No external sysno is available, keep using
# internal recid.
pass
tabs = [(unordered_tabs[tab_id]['label'], \
'%s/%s/%s/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, recid_to_display, tab_id, link_ln), \
tab_id == tab,
unordered_tabs[tab_id]['enabled']) \
for (tab_id, order) in ordered_tabs_id
if unordered_tabs[tab_id]['visible'] == True]
tabs_counts = get_detailed_page_tabs_counts(recid)
citedbynum = tabs_counts['Citations']
references = tabs_counts['References']
discussions = tabs_counts['Discussions']
# load content
if tab == 'usage':
req.write(webstyle_templates.detailed_record_container_top(recIDs[irec],
tabs,
ln,
citationnum=citedbynum,
referencenum=references,
discussionnum=discussions))
r = calculate_reading_similarity_list(recIDs[irec], "downloads")
downloadsimilarity = None
downloadhistory = None
#if r:
# downloadsimilarity = r
if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS:
downloadhistory = create_download_history_graph_and_box(recIDs[irec], ln)
r = calculate_reading_similarity_list(recIDs[irec], "pageviews")
viewsimilarity = None
if r: viewsimilarity = r
content = websearch_templates.tmpl_detailed_record_statistics(recIDs[irec],
ln,
downloadsimilarity=downloadsimilarity,
downloadhistory=downloadhistory,
viewsimilarity=viewsimilarity)
req.write(content)
req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec],
tabs,
ln))
elif tab == 'citations':
recid = recIDs[irec]
req.write(webstyle_templates.detailed_record_container_top(recid,
tabs,
ln,
citationnum=citedbynum,
referencenum=references,
discussionnum=discussions))
req.write(websearch_templates.tmpl_detailed_record_citations_prologue(recid, ln))
# Citing
citinglist = calculate_cited_by_list(recid)
req.write(websearch_templates.tmpl_detailed_record_citations_citing_list(recid,
ln,
citinglist,
sf=sf,
so=so,
sp=sp,
rm=rm))
# Self-cited
selfcited = get_self_cited_by(recid)
req.write(websearch_templates.tmpl_detailed_record_citations_self_cited(recid,
ln, selfcited=selfcited, citinglist=citinglist))
# Co-cited
s = calculate_co_cited_with_list(recid)
cociting = None
if s:
cociting = s
req.write(websearch_templates.tmpl_detailed_record_citations_co_citing(recid,
ln,
cociting=cociting))
# Citation history, if needed
citationhistory = None
if citinglist:
citationhistory = create_citation_history_graph_and_box(recid, ln)
#debug
if verbose > 3:
write_warning("Citation graph debug: " + \
str(len(citationhistory)), req=req)
req.write(websearch_templates.tmpl_detailed_record_citations_citation_history(recid, ln, citationhistory))
req.write(websearch_templates.tmpl_detailed_record_citations_epilogue(recid, ln))
req.write(webstyle_templates.detailed_record_container_bottom(recid,
tabs,
ln))
elif tab == 'references':
req.write(webstyle_templates.detailed_record_container_top(recIDs[irec],
tabs,
ln,
citationnum=citedbynum,
referencenum=references,
discussionnum=discussions))
req.write(format_record(recIDs[irec], 'HDREF', ln=ln, user_info=user_info, verbose=verbose))
req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec],
tabs,
ln))
elif tab == 'keywords':
from invenio import bibclassify_webinterface
recid = recIDs[irec]
bibclassify_webinterface.main_page(req, recid, tabs, ln, webstyle_templates)
elif tab == 'plots':
req.write(webstyle_templates.detailed_record_container_top(recIDs[irec],
tabs,
ln))
content = websearch_templates.tmpl_record_plots(recID=recIDs[irec],
ln=ln)
req.write(content)
req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec],
tabs,
ln))
else:
# Metadata tab
req.write(webstyle_templates.detailed_record_container_top(recIDs[irec],
tabs,
ln,
show_short_rec_p=False,
citationnum=citedbynum, referencenum=references,
discussionnum=discussions))
creationdate = None
modificationdate = None
if record_exists(recIDs[irec]) == 1:
creationdate = get_creation_date(recIDs[irec])
modificationdate = get_modification_date(recIDs[irec])
content = print_record(recIDs[irec], format, ot, ln,
search_pattern=search_pattern,
user_info=user_info, verbose=verbose,
sf=sf, so=so, sp=sp, rm=rm)
content = websearch_templates.tmpl_detailed_record_metadata(
recID = recIDs[irec],
ln = ln,
format = format,
creationdate = creationdate,
modificationdate = modificationdate,
content = content)
# display of the next-hit/previous-hit/back-to-search links
# on the detailed record pages
content += websearch_templates.tmpl_display_back_to_search(req,
recIDs[irec],
ln)
req.write(content)
req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec],
tabs,
ln,
creationdate=creationdate,
modificationdate=modificationdate,
show_short_rec_p=False))
if len(tabs) > 0:
# Add the mini box at bottom of the page
if CFG_WEBCOMMENT_ALLOW_REVIEWS:
from invenio.webcomment import get_mini_reviews
reviews = get_mini_reviews(recid = recIDs[irec], ln=ln)
else:
reviews = ''
actions = format_record(recIDs[irec], 'HDACT', ln=ln, user_info=user_info, verbose=verbose)
files = format_record(recIDs[irec], 'HDFILE', ln=ln, user_info=user_info, verbose=verbose)
req.write(webstyle_templates.detailed_record_mini_panel(recIDs[irec],
ln,
format,
files=files,
reviews=reviews,
actions=actions))
else:
# Other formats
for irec in range(irec_max, irec_min, -1):
req.write(print_record(recIDs[irec], format, ot, ln,
search_pattern=search_pattern,
user_info=user_info, verbose=verbose,
sf=sf, so=so, sp=sp, rm=rm))
else:
write_warning(_("Use different search terms."), req=req)
def print_records_prologue(req, format, cc=None):
"""
Print the appropriate prologue for list of records in the given
format.
"""
prologue = "" # no prologue needed for HTML or Text formats
if format.startswith('xm'):
prologue = websearch_templates.tmpl_xml_marc_prologue()
elif format.startswith('xn'):
prologue = websearch_templates.tmpl_xml_nlm_prologue()
elif format.startswith('xw'):
prologue = websearch_templates.tmpl_xml_refworks_prologue()
elif format.startswith('xr'):
prologue = websearch_templates.tmpl_xml_rss_prologue(cc=cc)
elif format.startswith('xe8x'):
prologue = websearch_templates.tmpl_xml_endnote_8x_prologue()
elif format.startswith('xe'):
prologue = websearch_templates.tmpl_xml_endnote_prologue()
elif format.startswith('xo'):
prologue = websearch_templates.tmpl_xml_mods_prologue()
elif format.startswith('xp'):
prologue = websearch_templates.tmpl_xml_podcast_prologue(cc=cc)
elif format.startswith('x'):
prologue = websearch_templates.tmpl_xml_default_prologue()
req.write(prologue)
def print_records_epilogue(req, format):
"""
Print the appropriate epilogue for list of records in the given
format.
"""
epilogue = "" # no epilogue needed for HTML or Text formats
if format.startswith('xm'):
epilogue = websearch_templates.tmpl_xml_marc_epilogue()
elif format.startswith('xn'):
epilogue = websearch_templates.tmpl_xml_nlm_epilogue()
elif format.startswith('xw'):
epilogue = websearch_templates.tmpl_xml_refworks_epilogue()
elif format.startswith('xr'):
epilogue = websearch_templates.tmpl_xml_rss_epilogue()
elif format.startswith('xe8x'):
epilogue = websearch_templates.tmpl_xml_endnote_8x_epilogue()
elif format.startswith('xe'):
epilogue = websearch_templates.tmpl_xml_endnote_epilogue()
elif format.startswith('xo'):
epilogue = websearch_templates.tmpl_xml_mods_epilogue()
elif format.startswith('xp'):
epilogue = websearch_templates.tmpl_xml_podcast_epilogue()
elif format.startswith('x'):
epilogue = websearch_templates.tmpl_xml_default_epilogue()
req.write(epilogue)
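# A minimal usage sketch of the prologue/epilogue pair above (assumed caller
# code, not part of this module): for the XML formats the prologue opens the
# enclosing container element and the epilogue closes it, with the formatted
# records emitted in between:
#
#   print_records_prologue(req, 'xm')
#   format_records(recids, 'xm', ln=ln, record_separator="\n", req=req)
#   print_records_epilogue(req, 'xm')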
def get_record(recid):
"""Directly the record object corresponding to the recid."""
if CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE:
value = run_sql("SELECT value FROM bibfmt WHERE id_bibrec=%s AND FORMAT='recstruct'", (recid, ))
if value:
try:
return deserialize_via_marshal(value[0][0])
except:
### In case of corruption, let's rebuild it!
pass
return create_record(print_record(recid, 'xm'))[0]
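# Usage sketch (hypothetical recid): the returned object is a BibRecord
# structure, either deserialized from the cached 'recstruct' format or
# rebuilt on the fly from the record's MARCXML:
#
#   record = get_record(42)   # dict-like structure keyed by MARC tag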
def print_record(recID, format='hb', ot='', ln=CFG_SITE_LANG, decompress=zlib.decompress,
search_pattern=None, user_info=None, verbose=0, sf='', so='d', sp='', rm=''):
"""
Prints record 'recID' formatted according to 'format'.
'sf' is sort field and 'rm' is ranking method that are passed here
only for proper linking purposes: e.g. when a certain ranking
method or a certain sort field was selected, keep it selected in
any dynamic search links that may be printed.
"""
if format == 'recstruct':
return get_record(recID)
_ = gettext_set_language(ln)
display_claim_this_paper = False
try:
display_claim_this_paper = user_info["precached_viewclaimlink"]
except (KeyError, TypeError):
display_claim_this_paper = False
#check from user information if the user has the right to see hidden fields/tags in the
#records as well
can_see_hidden = False
if user_info:
can_see_hidden = user_info.get('precached_canseehiddenmarctags', False)
out = ""
# sanity check:
record_exist_p = record_exists(recID)
if record_exist_p == 0: # doesn't exist
return out
# New Python BibFormat procedure for formatting
# Old procedure follows further below
# We must still check some special formats, but these
# should disappear when BibFormat improves.
if not (CFG_BIBFORMAT_USE_OLD_BIBFORMAT \
or format.lower().startswith('t') \
or format.lower().startswith('hm') \
or str(format[0:3]).isdigit() \
or ot):
# Unspecified format is hd
if format == '':
format = 'hd'
if record_exist_p == -1 and get_output_format_content_type(format) == 'text/html':
# HTML output displays a default value for deleted records.
            # Other formats have to deal with it.
out += _("The record has been deleted.")
# was record deleted-but-merged ?
merged_recid = get_merged_recid(recID)
if merged_recid:
out += ' ' + _("The record %d replaces it." % merged_recid)
else:
out += call_bibformat(recID, format, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
# at the end of HTML brief mode, print the "Detailed record" functionality:
if format.lower().startswith('hb') and \
format.lower() != 'hb_p':
out += websearch_templates.tmpl_print_record_brief_links(ln=ln,
recID=recID,
sf=sf,
so=so,
sp=sp,
rm=rm,
display_claim_link=display_claim_this_paper)
return out
# Old PHP BibFormat procedure for formatting
# print record opening tags, if needed:
if format == "marcxml" or format == "oai_dc":
out += " " + cgi.escape(get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"], can_see_hidden)) + "
"
else:
out += "\n" + cgi.escape(get_fieldvalues_alephseq_like(recID, ot, can_see_hidden)) + "
"
elif format.startswith("h") and ot:
## user directly asked for some tags to be displayed only
if record_exist_p == -1:
out += "\n" + get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"], can_see_hidden) + "
"
else:
out += "\n" + get_fieldvalues_alephseq_like(recID, ot, can_see_hidden) + "
"
elif format == "hd":
# HTML detailed format
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
# look for detailed format existence:
query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s"
res = run_sql(query, (recID, format), 1)
if res:
# record 'recID' is formatted in 'format', so print it
out += "%s" % decompress(res[0][0])
else:
# record 'recID' is not formatted in 'format', so try to call BibFormat on the fly or use default format:
out_record_in_format = call_bibformat(recID, format, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
if out_record_in_format:
out += out_record_in_format
else:
out += websearch_templates.tmpl_print_record_detailed(
ln = ln,
recID = recID,
)
elif format.startswith("hb_") or format.startswith("hd_"):
# underscore means that HTML brief/detailed formats should be called on-the-fly; suitable for testing formats
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
out += call_bibformat(recID, format, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
elif format.startswith("hx"):
# BibTeX format, called on the fly:
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
out += call_bibformat(recID, format, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
elif format.startswith("hs"):
# for citation/download similarity navigation links:
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
            out += '<a href="%s">' % websearch_templates.build_search_url(recid=recID, ln=ln)
            # firstly, title:
            titles = get_fieldvalues(recID, "245__a")
            if titles:
                for title in titles:
                    out += "<strong>%s</strong>" % title
            else:
                # usual title not found, try conference title:
                titles = get_fieldvalues(recID, "111__a")
                if titles:
                    for title in titles:
                        out += "<strong>%s</strong>" % title
                else:
                    # just print record ID:
                    out += "<strong>%s %d</strong>" % (get_field_i18nname("record ID", ln, False), recID)
            out += "</a>"
            # secondly, authors:
            authors = get_fieldvalues(recID, "100__a") + get_fieldvalues(recID, "700__a")
            if authors:
                out += " - %s" % authors[0]
                if len(authors) > 1:
                    out += " <em>et al</em>"
# thirdly publication info:
publinfos = get_fieldvalues(recID, "773__s")
if not publinfos:
publinfos = get_fieldvalues(recID, "909C4s")
if not publinfos:
publinfos = get_fieldvalues(recID, "037__a")
if not publinfos:
publinfos = get_fieldvalues(recID, "088__a")
if publinfos:
out += " - %s" % publinfos[0]
else:
# fourthly publication year (if not publication info):
years = get_fieldvalues(recID, "773__y")
if not years:
years = get_fieldvalues(recID, "909C4y")
if not years:
years = get_fieldvalues(recID, "260__c")
if years:
out += " (%s)" % years[0]
else:
# HTML brief format by default
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s"
res = run_sql(query, (recID, format))
if res:
# record 'recID' is formatted in 'format', so print it
out += "%s" % decompress(res[0][0])
else:
                # record 'recID' is not formatted in 'format', so try to call BibFormat on the fly, or use default format:
if CFG_WEBSEARCH_CALL_BIBFORMAT:
out_record_in_format = call_bibformat(recID, format, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
if out_record_in_format:
out += out_record_in_format
else:
out += websearch_templates.tmpl_print_record_brief(
ln = ln,
recID = recID,
)
else:
out += websearch_templates.tmpl_print_record_brief(
ln = ln,
recID = recID,
)
# at the end of HTML brief mode, print the "Detailed record" functionality:
if format == 'hp' or format.startswith("hb_") or format.startswith("hd_"):
pass # do nothing for portfolio and on-the-fly formats
else:
out += websearch_templates.tmpl_print_record_brief_links(ln=ln,
recID=recID,
sf=sf,
so=so,
sp=sp,
rm=rm,
display_claim_link=display_claim_this_paper)
# print record closing tags, if needed:
if format == "marcxml" or format == "oai_dc":
out += " Search Cache
"
# clear cache if requested:
if action == "clear":
search_results_cache.clear()
req.write(out)
# show collection reclist cache:
out = "Collection reclist cache
"
out += "- collection table last updated: %s" % get_table_update_time('collection')
out += "
- reclist cache timestamp: %s" % collection_reclist_cache.timestamp
out += "
- reclist cache contents:"
out += ""
for coll in collection_reclist_cache.cache.keys():
if collection_reclist_cache.cache[coll]:
out += "%s (%d)
"
req.write(out)
# show search results cache:
out = "
" % (coll, len(collection_reclist_cache.cache[coll]))
out += "Search Cache
"
out += "- search cache usage: %d queries cached (max. ~%d)" % \
(len(search_results_cache.cache), CFG_WEBSEARCH_SEARCH_CACHE_SIZE)
if len(search_results_cache.cache):
out += "
- search cache contents:"
out += ""
for query, hitset in search_results_cache.cache.items():
out += "
"
req.write(out)
# show field i18nname cache:
out = "
%s ... %s" % (query, hitset)
out += """Field I18N names cache
"
out += "- fieldname table last updated: %s" % get_table_update_time('fieldname')
out += "
- i18nname cache timestamp: %s" % field_i18nname_cache.timestamp
out += "
- i18nname cache contents:"
out += ""
for field in field_i18nname_cache.cache.keys():
for ln in field_i18nname_cache.cache[field].keys():
out += "%s, %s = %s
"
req.write(out)
# show collection i18nname cache:
out = "
" % (field, ln, field_i18nname_cache.cache[field][ln])
out += "Collection I18N names cache
"
out += "- collectionname table last updated: %s" % get_table_update_time('collectionname')
out += "
- i18nname cache timestamp: %s" % collection_i18nname_cache.timestamp
out += "
- i18nname cache contents:"
out += ""
for coll in collection_i18nname_cache.cache.keys():
for ln in collection_i18nname_cache.cache[coll].keys():
out += "%s, %s = %s
"
req.write(out)
req.write("")
return "\n"
def perform_request_log(req, date=""):
"""Display search log information for given date."""
req.content_type = "text/html"
req.send_http_header()
req.write("")
req.write("
" % (coll, ln, collection_i18nname_cache.cache[coll][ln])
out += "Search Log
")
if date: # case A: display stats for a day
yyyymmdd = string.atoi(date)
req.write("""")
req.write("
")
else: # case B: display summary stats per day
yyyymm01 = int(time.strftime("%Y%m01", time.localtime()))
yyyymmdd = int(time.strftime("%Y%m%d", time.localtime()))
req.write(""" " % ("No.", "Time", "Pattern", "Field", "Collection", "Number of Hits"))
# read file:
p = os.popen("grep ^%d %s/search.log" % (yyyymmdd, CFG_LOGDIR), 'r')
lines = p.readlines()
p.close()
# process lines:
i = 0
for line in lines:
try:
datetime, dummy_aas, p, f, c, nbhits = string.split(line,"#")
i += 1
req.write("%s %s %s %s %s %s " \
% (i, datetime[8:10], datetime[10:12], datetime[12:], p, f, c, nbhits))
except:
pass # ignore eventual wrong log lines
req.write("#%d %s:%s:%s %s %s %s %s """)
req.write("
")
req.write("")
return "\n"
def get_all_field_values(tag):
"""
Return all existing values stored for a given tag.
@param tag: the full tag, e.g. 909C0b
@type tag: string
@return: the list of values
@rtype: list of strings
"""
    table = 'bib%02dx' % int(tag[:2])
return [row[0] for row in run_sql("SELECT DISTINCT(value) FROM %s WHERE tag=%%s" % table, (tag, ))]
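# Illustration of the tag-to-table mapping used above (hypothetical tag):
# the first two digits of the MARC tag select the bibXXx table, so a tag
# such as '909C0b' is looked up in bib90x:
#
#   get_all_field_values('909C0b')
#   # -> SELECT DISTINCT(value) FROM bib90x WHERE tag='909C0b'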
def get_most_popular_field_values(recids, tags, exclude_values=None, count_repetitive_values=True):
"""
Analyze RECIDS and look for TAGS and return most popular values
and the frequency with which they occur sorted according to
descending frequency.
If a value is found in EXCLUDE_VALUES, then do not count it.
If COUNT_REPETITIVE_VALUES is True, then we count every occurrence
of value in the tags. If False, then we count the value only once
regardless of the number of times it may appear in a record.
(But, if the same value occurs in another record, we count it, of
course.)
Example:
>>> get_most_popular_field_values(range(11,20), '980__a')
(('PREPRINT', 10), ('THESIS', 7), ...)
>>> get_most_popular_field_values(range(11,20), ('100__a', '700__a'))
(('Ellis, J', 10), ('Ellis, N', 7), ...)
>>> get_most_popular_field_values(range(11,20), ('100__a', '700__a'), ('Ellis, J'))
(('Ellis, N', 7), ...)
"""
def _get_most_popular_field_values_helper_sorter(val1, val2):
"Compare VAL1 and VAL2 according to, firstly, frequency, then secondly, alphabetically."
compared_via_frequencies = cmp(valuefreqdict[val2], valuefreqdict[val1])
if compared_via_frequencies == 0:
return cmp(val1.lower(), val2.lower())
else:
return compared_via_frequencies
valuefreqdict = {}
## sanity check:
if not exclude_values:
exclude_values = []
if isinstance(tags, str):
tags = (tags,)
## find values to count:
vals_to_count = []
displaytmp = {}
if count_repetitive_values:
# counting technique A: can look up many records at once: (very fast)
for tag in tags:
vals_to_count.extend(get_fieldvalues(recids, tag, sort=False))
else:
# counting technique B: must count record-by-record: (slow)
for recid in recids:
vals_in_rec = []
for tag in tags:
for val in get_fieldvalues(recid, tag, False):
vals_in_rec.append(val)
# do not count repetitive values within this record
# (even across various tags, so need to unify again):
dtmp = {}
for val in vals_in_rec:
dtmp[val.lower()] = 1
displaytmp[val.lower()] = val
vals_in_rec = dtmp.keys()
vals_to_count.extend(vals_in_rec)
## are we to exclude some of found values?
for val in vals_to_count:
if val not in exclude_values:
if valuefreqdict.has_key(val):
valuefreqdict[val] += 1
else:
valuefreqdict[val] = 1
## sort by descending frequency of values:
out = ()
vals = valuefreqdict.keys()
vals.sort(_get_most_popular_field_values_helper_sorter)
for val in vals:
tmpdisplv = ''
if displaytmp.has_key(val):
tmpdisplv = displaytmp[val]
else:
tmpdisplv = val
out += (tmpdisplv, valuefreqdict[val]),
return out
def profile(p="", f="", c=CFG_SITE_NAME):
"""Profile search time."""
import profile
import pstats
profile.run("perform_request_search(p='%s',f='%s', c='%s')" % (p, f, c), "perform_request_search_profile")
p = pstats.Stats("perform_request_search_profile")
p.strip_dirs().sort_stats("cumulative").print_stats()
return 0
diff --git a/modules/websearch/lib/websearch_webcoll.py b/modules/websearch/lib/websearch_webcoll.py
index 3660aadce..e9a355b26 100644
--- a/modules/websearch/lib/websearch_webcoll.py
+++ b/modules/websearch/lib/websearch_webcoll.py
@@ -1,1079 +1,1079 @@
## This file is part of Invenio.
## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Create Invenio collection cache."""
__revision__ = "$Id$"
import calendar
import copy
import sys
import cgi
import re
import os
import string
import time
from invenio.config import \
CFG_CERN_SITE, \
CFG_WEBSEARCH_INSTANT_BROWSE, \
CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, \
CFG_WEBSEARCH_I18N_LATEST_ADDITIONS, \
CFG_CACHEDIR, \
CFG_SITE_LANG, \
CFG_SITE_NAME, \
CFG_SITE_LANGS, \
CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES, \
CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE
from invenio.messages import gettext_set_language, language_list_long
from invenio.search_engine import search_pattern_parenthesised, get_creation_date, get_field_i18nname, collection_restricted_p, sort_records
from invenio.dbquery import run_sql, Error, get_table_update_time
from invenio.bibrank_record_sorter import get_bibrank_methods
from invenio.dateutils import convert_datestruct_to_dategui
from invenio.bibformat import format_record
from invenio.shellutils import mymkdir
from invenio.intbitset import intbitset
from invenio.websearch_external_collections import \
external_collection_load_states, \
dico_collection_external_searches, \
external_collection_sort_engine_by_name
from invenio.bibtask import task_init, task_get_option, task_set_option, \
write_message, task_has_option, task_update_progress, \
task_sleep_now_if_required
import invenio.template
websearch_templates = invenio.template.load('websearch')
from invenio.websearch_external_collections_searcher import external_collections_dictionary
from invenio.websearch_external_collections_config import CFG_EXTERNAL_COLLECTION_TIMEOUT
from invenio.websearch_external_collections_config import CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS
## global vars
COLLECTION_HOUSE = {} # will hold collections we treat in this run of the program; a dict of {collname1: collobject1, ...}
# CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE -- cache timestamp
# tolerance (in seconds), to account for the fact that an admin might
# accidentally happen to edit the collection definitions at exactly
# the same second when some webcoll process was about to be started.
# In order to be safe, let's put an exaggerated timestamp tolerance
# value such as 20 seconds:
CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE = 20
# CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE -- location of the cache
# timestamp file:
CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE = "%s/collections/last_updated" % CFG_CACHEDIR
# CFG_CACHE_LAST_FAST_UPDATED_TIMESTAMP_FILE -- location of the cache
# timestamp file used when running webcoll in the fast-mode.
CFG_CACHE_LAST_FAST_UPDATED_TIMESTAMP_FILE = "%s/collections/last_fast_updated" % CFG_CACHEDIR
def get_collection(colname):
"""Return collection object from the collection house for given colname.
    If it does not exist, then create it."""
if not COLLECTION_HOUSE.has_key(colname):
colobject = Collection(colname)
COLLECTION_HOUSE[colname] = colobject
return COLLECTION_HOUSE[colname]
## auxiliary functions:
def is_selected(var, fld):
"Checks if the two are equal, and if yes, returns ' selected'. Useful for select boxes."
if var == fld:
return ' selected="selected"'
else:
return ""
def get_field(recID, tag):
"Gets list of field 'tag' for the record with 'recID' system number."
out = []
digit = tag[0:2]
bx = "bib%sx" % digit
bibx = "bibrec_bib%sx" % digit
query = "SELECT bx.value FROM %s AS bx, %s AS bibx WHERE bibx.id_bibrec='%s' AND bx.id=bibx.id_bibxxx AND bx.tag='%s'" \
% (bx, bibx, recID, tag)
res = run_sql(query)
for row in res:
out.append(row[0])
return out
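# A parameterized variant of the query above (same behaviour, sketched with
# the %s placeholders that run_sql accepts elsewhere in this module), which
# avoids interpolating recID and tag directly into the SQL string:
#
#   res = run_sql("SELECT bx.value FROM %s AS bx, %s AS bibx"
#                 " WHERE bibx.id_bibrec=%%s AND bx.id=bibx.id_bibxxx"
#                 " AND bx.tag=%%s" % (bx, bibx), (recID, tag))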
def check_nbrecs_for_all_external_collections():
"""Check if any of the external collections have changed their total number of records, aka nbrecs.
Return True if any of the total numbers of records have changed and False if they're all the same."""
res = run_sql("SELECT name FROM collection WHERE dbquery LIKE 'hostedcollection:%';")
for row in res:
coll_name = row[0]
if (get_collection(coll_name)).check_nbrecs_for_external_collection():
write_message("External collection %s found updated." % coll_name, verbose=6)
return True
write_message("All external collections are up to date.", verbose=6)
return False
class Collection:
"Holds the information on collections (id,name,dbquery)."
def __init__(self, name=""):
"Creates collection instance by querying the DB configuration database about 'name'."
self.calculate_reclist_run_already = 0 # to speed things up without much refactoring
self.update_reclist_run_already = 0 # to speed things up without much refactoring
self.reclist_with_nonpublic_subcolls = intbitset()
# used to store the temporary result of the calculation of nbrecs of an external collection
self.nbrecs_tmp = None
if not name:
self.name = CFG_SITE_NAME # by default we are working on the home page
self.id = 1
self.dbquery = None
self.nbrecs = None
self.reclist = intbitset()
else:
self.name = name
try:
res = run_sql("""SELECT id,name,dbquery,nbrecs,reclist FROM collection
WHERE name=%s""", (name,))
if res:
self.id = res[0][0]
self.name = res[0][1]
self.dbquery = res[0][2]
self.nbrecs = res[0][3]
try:
self.reclist = intbitset(res[0][4])
except:
self.reclist = intbitset()
else: # collection does not exist!
self.id = None
self.dbquery = None
self.nbrecs = None
self.reclist = intbitset()
except Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
sys.exit(1)
def get_example_search_queries(self):
"""Returns list of sample search queries for this collection.
"""
res = run_sql("""SELECT example.body FROM example
LEFT JOIN collection_example on example.id=collection_example.id_example
WHERE collection_example.id_collection=%s ORDER BY collection_example.score""", (self.id,))
return [query[0] for query in res]
def get_name(self, ln=CFG_SITE_LANG, name_type="ln", prolog="", epilog="", prolog_suffix=" ", epilog_suffix=""):
"""Return nicely formatted collection name for language LN.
The NAME_TYPE may be 'ln' (=long name), 'sn' (=short name), etc."""
out = prolog
i18name = ""
res = run_sql("SELECT value FROM collectionname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, ln, name_type))
try:
i18name += res[0][0]
except IndexError:
pass
if i18name:
out += i18name
else:
out += self.name
out += epilog
return out
def get_ancestors(self):
"Returns list of ancestors of the current collection."
ancestors = []
ancestors_ids = intbitset()
id_son = self.id
while 1:
query = "SELECT cc.id_dad,c.name FROM collection_collection AS cc, collection AS c "\
"WHERE cc.id_son=%d AND c.id=cc.id_dad" % int(id_son)
res = run_sql(query, None, 1)
if res:
col_ancestor = get_collection(res[0][1])
# looking for loops
if self.id in ancestors_ids:
write_message("Loop found in collection %s" % self.name, stream=sys.stderr)
raise OverflowError("Loop found in collection %s" % self.name)
else:
ancestors.append(col_ancestor)
ancestors_ids.add(col_ancestor.id)
id_son = res[0][0]
else:
break
ancestors.reverse()
return ancestors
def restricted_p(self):
"""Predicate to test if the collection is restricted or not. Return the contect of the
`restrited' column of the collection table (typically Apache group). Otherwise return
None if the collection is public."""
if collection_restricted_p(self.name):
return 1
return None
def get_sons(self, type='r'):
"Returns list of direct sons of type 'type' for the current collection."
sons = []
id_dad = self.id
query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\
"WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC, c.name ASC" % (int(id_dad), type)
res = run_sql(query)
for row in res:
sons.append(get_collection(row[1]))
return sons
def get_descendants(self, type='r'):
"Returns list of all descendants of type 'type' for the current collection."
descendants = []
descendant_ids = intbitset()
id_dad = self.id
query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\
"WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC" % (int(id_dad), type)
res = run_sql(query)
for row in res:
col_desc = get_collection(row[1])
# looking for loops
if self.id in descendant_ids:
write_message("Loop found in collection %s" % self.name, stream=sys.stderr)
raise OverflowError("Loop found in collection %s" % self.name)
else:
descendants.append(col_desc)
descendant_ids.add(col_desc.id)
tmp_descendants = col_desc.get_descendants()
for descendant in tmp_descendants:
descendant_ids.add(descendant.id)
descendants += tmp_descendants
return descendants
def write_cache_file(self, filename='', filebody=''):
"Write a file inside collection cache."
# open file:
dirname = "%s/collections/%d" % (CFG_CACHEDIR, self.id)
mymkdir(dirname)
fullfilename = dirname + "/%s.html" % filename
try:
os.umask(022)
f = open(fullfilename, "w")
except IOError, v:
try:
(code, message) = v
except:
code = 0
message = v
print "I/O Error: " + str(message) + " (" + str(code) + ")"
sys.exit(1)
# print user info:
write_message("... creating %s" % fullfilename, verbose=6)
sys.stdout.flush()
# print page body:
f.write(filebody)
# close file:
f.close()
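    # Sketch of the resulting cache layout (paths illustrative): each
    # collection gets its own directory keyed by collection id, with one
    # HTML fragment per cached page element, e.g.:
    #
    #   CFG_CACHEDIR/collections/17/body-as=0-ln=en.html
    #   CFG_CACHEDIR/collections/17/navtrail-as=1-ln=fr.html
    #   CFG_CACHEDIR/collections/17/last-updated-ln=en.html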
def update_webpage_cache(self):
"""Create collection page header, navtrail, body (including left and right stripes) and footer, and
call write_cache_file() afterwards to update the collection webpage cache."""
## precalculate latest additions for non-aggregate
        ## collections (the info is ln- and aas-independent)
if self.dbquery and not CFG_WEBSEARCH_I18N_LATEST_ADDITIONS:
self.create_latest_additions_info()
## do this for each language:
for lang, lang_fullname in language_list_long():
            # but only if this language was among those requested (default: all):
if lang in task_get_option("language", [lang]):
if self.dbquery and CFG_WEBSEARCH_I18N_LATEST_ADDITIONS:
self.create_latest_additions_info(ln=lang)
# load the right message language
_ = gettext_set_language(lang)
## first, update navtrail:
for aas in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES:
self.write_cache_file("navtrail-as=%s-ln=%s" % (aas, lang),
self.create_navtrail_links(aas, lang))
## second, update page body:
for aas in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES: # do light, simple and advanced search pages:
body = websearch_templates.tmpl_webcoll_body(
ln=lang, collection=self.name,
te_portalbox = self.create_portalbox(lang, 'te'),
searchfor = self.create_searchfor(aas, lang),
np_portalbox = self.create_portalbox(lang, 'np'),
narrowsearch = self.create_narrowsearch(aas, lang, 'r'),
focuson = self.create_narrowsearch(aas, lang, "v") + \
self.create_external_collections_box(lang),
instantbrowse = self.create_instant_browse(aas=aas, ln=lang),
ne_portalbox = self.create_portalbox(lang, 'ne')
)
self.write_cache_file("body-as=%s-ln=%s" % (aas, lang), body)
## third, write portalboxes:
self.write_cache_file("portalbox-tp-ln=%s" % lang, self.create_portalbox(lang, "tp"))
self.write_cache_file("portalbox-te-ln=%s" % lang, self.create_portalbox(lang, "te"))
self.write_cache_file("portalbox-lt-ln=%s" % lang, self.create_portalbox(lang, "lt"))
self.write_cache_file("portalbox-rt-ln=%s" % lang, self.create_portalbox(lang, "rt"))
## fourth, write 'last updated' information:
self.write_cache_file("last-updated-ln=%s" % lang,
convert_datestruct_to_dategui(time.localtime(),
ln=lang))
return
def create_navtrail_links(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG):
"""Creates navigation trail links, i.e. links to collection
ancestors (except Home collection). If aas==1, then links to
Advanced Search interfaces; otherwise Simple Search.
"""
dads = []
for dad in self.get_ancestors():
if dad.name != CFG_SITE_NAME: # exclude Home collection
dads.append((dad.name, dad.get_name(ln)))
return websearch_templates.tmpl_navtrail_links(
aas=aas, ln=ln, dads=dads)
def create_portalbox(self, lang=CFG_SITE_LANG, position="rt"):
"""Creates portalboxes of language CFG_SITE_LANG of the position POSITION by consulting DB configuration database.
The position may be: 'lt'='left top', 'rt'='right top', etc."""
out = ""
query = "SELECT p.title,p.body FROM portalbox AS p, collection_portalbox AS cp "\
" WHERE cp.id_collection=%d AND p.id=cp.id_portalbox AND cp.ln='%s' AND cp.position='%s' "\
" ORDER BY cp.score DESC" % (self.id, lang, position)
res = run_sql(query)
for row in res:
title, body = row[0], row[1]
if title:
out += websearch_templates.tmpl_portalbox(title = title,
body = body)
else:
# no title specified, so print body ``as is'' only:
out += body
return out
def create_narrowsearch(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG, type="r"):
"""Creates list of collection descendants of type 'type' under title 'title'.
If aas==1, then links to Advanced Search interfaces; otherwise Simple Search.
Suitable for 'Narrow search' and 'Focus on' boxes."""
# get list of sons and analyse it
sons = self.get_sons(type)
if not sons:
return ''
        # get descendants
descendants = self.get_descendants(type)
grandsons = []
if CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS:
# load grandsons for each son
for son in sons:
grandsons.append(son.get_sons())
# return ""
return websearch_templates.tmpl_narrowsearch(
aas = aas,
ln = ln,
type = type,
father = self,
has_grandchildren = len(descendants)>len(sons),
sons = sons,
display_grandsons = CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS,
grandsons = grandsons
)
def create_external_collections_box(self, ln=CFG_SITE_LANG):
external_collection_load_states()
if not dico_collection_external_searches.has_key(self.id):
return ""
engines_list = external_collection_sort_engine_by_name(dico_collection_external_searches[self.id])
return websearch_templates.tmpl_searchalso(ln, engines_list, self.id)
def create_latest_additions_info(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, ln=CFG_SITE_LANG):
"""
Create info about latest additions that will be used for
create_instant_browse() later.
"""
self.latest_additions_info = []
if self.nbrecs and self.reclist:
# firstly, get last 'rg' records:
recIDs = list(self.reclist)
of = 'hb'
# CERN hack begins: tweak latest additions for selected collections:
if CFG_CERN_SITE:
# alter recIDs list for some CERN collections:
this_year = time.strftime("%Y", time.localtime())
if self.name in ['CERN Yellow Reports','Videos']:
last_year = str(int(this_year) - 1)
# detect recIDs only from this and past year:
recIDs = list(self.reclist & \
search_pattern_parenthesised(p='year:%s or year:%s' % \
(this_year, last_year)))
elif self.name in ['VideosXXX']:
# detect recIDs only from this year:
recIDs = list(self.reclist & \
search_pattern_parenthesised(p='year:%s' % this_year))
elif self.name == 'CMS Physics Analysis Summaries' and \
1281585 in self.reclist:
# REALLY, REALLY temporary hack
recIDs = list(self.reclist)
recIDs.remove(1281585)
# apply special filters:
if self.name in ['Videos']:
# select only videos with movies:
recIDs = list(intbitset(recIDs) & \
search_pattern_parenthesised(p='collection:"PUBLVIDEOMOVIE"'))
of = 'hvp'
# sort some CERN collections specially:
if self.name in ['Videos',
'Video Clips',
'Video Movies',
'Video News',
'Video Rushes',
'Webcast',
'ATLAS Videos',
'Restricted Video Movies',
'Restricted Video Rushes',
'LHC First Beam Videos',
'CERN openlab Videos']:
recIDs = sort_records(None, recIDs, '269__c')
elif self.name in ['LHCb Talks']:
recIDs = sort_records(None, recIDs, 'reportnumber')
# CERN hack ends.
total = len(recIDs)
to_display = min(rg, total)
for idx in range(total-1, total-to_display-1, -1):
recid = recIDs[idx]
self.latest_additions_info.append({'id': recid,
'format': format_record(recid, of, ln=ln),
                                                   'date': get_creation_date(recid, fmt="%Y-%m-%d<br />%H:%i")})
        return
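    # Each entry appended above is a small dict later consumed by
    # create_instant_browse() (illustrative values, not real records):
    #
    #   {'id': 1234,
    #    'format': '...HTML-brief-formatted record...',
    #    'date': '2012-05-07<br />14:32'}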
def create_instant_browse(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG):
"Searches database and produces list of last 'rg' records."
if self.restricted_p():
return websearch_templates.tmpl_box_restricted_content(ln = ln)
if str(self.dbquery).startswith("hostedcollection:"):
return websearch_templates.tmpl_box_hosted_collection(ln = ln)
if rg == 0:
# do not show latest additions box
return ""
# CERN hack: do not display latest additions for some CERN collections:
if CFG_CERN_SITE and self.name in ['Periodicals', 'Electronic Journals',
'Press Office Photo Selection',
'Press Office Video Selection']:
return ""
try:
self.latest_additions_info
latest_additions_info_p = True
        except AttributeError:
latest_additions_info_p = False
if latest_additions_info_p:
passIDs = []
for idx in range(0, min(len(self.latest_additions_info), rg)):
# CERN hack: display the records in a grid layout, so do not show the related links
if CFG_CERN_SITE and self.name in ['Videos']:
passIDs.append({'id': self.latest_additions_info[idx]['id'],
'body': self.latest_additions_info[idx]['format'],
'date': self.latest_additions_info[idx]['date']})
else:
passIDs.append({'id': self.latest_additions_info[idx]['id'],
'body': self.latest_additions_info[idx]['format'] + \
websearch_templates.tmpl_record_links(recid=self.latest_additions_info[idx]['id'],
rm='citation',
ln=ln),
'date': self.latest_additions_info[idx]['date']})
if self.nbrecs > rg:
url = websearch_templates.build_search_url(
cc=self.name, jrec=rg+1, ln=ln, aas=aas)
else:
url = ""
# CERN hack: display the records in a grid layout
if CFG_CERN_SITE and self.name in ['Videos']:
return websearch_templates.tmpl_instant_browse(
aas=aas, ln=ln, recids=passIDs, more_link=url, grid_layout=True)
return websearch_templates.tmpl_instant_browse(
aas=aas, ln=ln, recids=passIDs, more_link=url)
return websearch_templates.tmpl_box_no_records(ln=ln)
def create_searchoptions(self):
"Produces 'Search options' portal box."
box = ""
query = """SELECT DISTINCT(cff.id_field),f.code,f.name FROM collection_field_fieldvalue AS cff, field AS f
WHERE cff.id_collection=%d AND cff.id_fieldvalue IS NOT NULL AND cff.id_field=f.id
ORDER BY cff.score DESC""" % self.id
res = run_sql(query)
if res:
for row in res:
field_id = row[0]
field_code = row[1]
field_name = row[2]
query_bis = """SELECT fv.value,fv.name FROM fieldvalue AS fv, collection_field_fieldvalue AS cff
WHERE cff.id_collection=%d AND cff.type='seo' AND cff.id_field=%d AND fv.id=cff.id_fieldvalue
ORDER BY cff.score_fieldvalue DESC, cff.score DESC, fv.name ASC""" % (self.id, field_id)
res_bis = run_sql(query_bis)
if res_bis:
values = [{'value' : '', 'text' : 'any' + ' ' + field_name}] # FIXME: internationalisation of "any"
for row_bis in res_bis:
values.append({'value' : cgi.escape(row_bis[0], 1), 'text' : row_bis[1]})
box += websearch_templates.tmpl_select(
fieldname = field_code,
values = values
)
return box
def create_sortoptions(self, ln=CFG_SITE_LANG):
"""Produces 'Sort options' portal box."""
# load the right message language
_ = gettext_set_language(ln)
box = ""
query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff
WHERE id_collection=%d AND cff.type='soo' AND cff.id_field=f.id
ORDER BY cff.score DESC, f.name ASC""" % self.id
values = [{'value' : '', 'text': "- %s -" % _("latest first")}]
res = run_sql(query)
if res:
for row in res:
values.append({'value' : row[0], 'text': get_field_i18nname(row[1], ln)})
else:
for tmp in ('title', 'author', 'report number', 'year'):
values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)})
box = websearch_templates.tmpl_select(
fieldname = 'sf',
css_class = 'address',
values = values
)
box += websearch_templates.tmpl_select(
fieldname = 'so',
css_class = 'address',
values = [
{'value' : 'a' , 'text' : _("asc.")},
{'value' : 'd' , 'text' : _("desc.")}
]
)
return box
def create_rankoptions(self, ln=CFG_SITE_LANG):
"Produces 'Rank options' portal box."
# load the right message language
_ = gettext_set_language(ln)
values = [{'value' : '', 'text': "- %s %s -" % (string.lower(_("OR")), _("rank by"))}]
for (code, name) in get_bibrank_methods(self.id, ln):
values.append({'value' : code, 'text': name})
box = websearch_templates.tmpl_select(
fieldname = 'rm',
css_class = 'address',
values = values
)
return box
def create_displayoptions(self, ln=CFG_SITE_LANG):
"Produces 'Display options' portal box."
# load the right message language
_ = gettext_set_language(ln)
values = []
for i in ['10', '25', '50', '100', '250', '500']:
values.append({'value' : i, 'text' : i + ' ' + _("results")})
box = websearch_templates.tmpl_select(
fieldname = 'rg',
css_class = 'address',
values = values
)
if self.get_sons():
box += websearch_templates.tmpl_select(
fieldname = 'sc',
css_class = 'address',
values = [
{'value' : '1' , 'text' : _("split by collection")},
{'value' : '0' , 'text' : _("single list")}
]
)
return box
def create_formatoptions(self, ln=CFG_SITE_LANG):
"Produces 'Output format options' portal box."
# load the right message language
_ = gettext_set_language(ln)
box = ""
values = []
query = """SELECT f.code,f.name FROM format AS f, collection_format AS cf
WHERE cf.id_collection=%d AND cf.id_format=f.id AND f.visibility='1'
ORDER BY cf.score DESC, f.name ASC""" % self.id
res = run_sql(query)
if res:
for row in res:
values.append({'value' : row[0], 'text': row[1]})
else:
values.append({'value' : 'hb', 'text' : "HTML %s" % _("brief")})
box = websearch_templates.tmpl_select(
fieldname = 'of',
css_class = 'address',
values = values
)
return box
def create_searchwithin_selection_box(self, fieldname='f', value='', ln='en'):
"""Produces 'search within' selection box for the current collection."""
# get values
query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff
WHERE cff.type='sew' AND cff.id_collection=%d AND cff.id_field=f.id
ORDER BY cff.score DESC, f.name ASC""" % self.id
res = run_sql(query)
values = [{'value' : '', 'text' : get_field_i18nname("any field", ln)}]
if res:
for row in res:
values.append({'value' : row[0], 'text' : get_field_i18nname(row[1], ln)})
else:
if CFG_CERN_SITE:
for tmp in ['title', 'author', 'abstract', 'report number', 'year']:
values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)})
else:
for tmp in ['title', 'author', 'abstract', 'keyword', 'report number', 'journal', 'year', 'fulltext', 'reference']:
values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)})
return websearch_templates.tmpl_searchwithin_select(
fieldname = fieldname,
ln = ln,
selected = value,
values = values
)
def create_searchexample(self):
"Produces search example(s) for the current collection."
out = "$collSearchExamples = getSearchExample(%d, $se);" % self.id
return out
def create_searchfor(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG):
"Produces either Simple or Advanced 'Search for' box for the current collection."
if aas == 1:
return self.create_searchfor_advanced(ln)
elif aas == 0:
return self.create_searchfor_simple(ln)
else:
return self.create_searchfor_light(ln)
def create_searchfor_light(self, ln=CFG_SITE_LANG):
"Produces light 'Search for' box for the current collection."
return websearch_templates.tmpl_searchfor_light(
ln=ln,
collection_id = self.name,
collection_name=self.get_name(ln=ln),
record_count=self.nbrecs,
example_search_queries=self.get_example_search_queries(),
)
def create_searchfor_simple(self, ln=CFG_SITE_LANG):
"Produces simple 'Search for' box for the current collection."
return websearch_templates.tmpl_searchfor_simple(
ln=ln,
collection_id = self.name,
collection_name=self.get_name(ln=ln),
record_count=self.nbrecs,
middle_option = self.create_searchwithin_selection_box(ln=ln),
)
def create_searchfor_advanced(self, ln=CFG_SITE_LANG):
"Produces advanced 'Search for' box for the current collection."
return websearch_templates.tmpl_searchfor_advanced(
ln = ln,
collection_id = self.name,
collection_name=self.get_name(ln=ln),
record_count=self.nbrecs,
middle_option_1 = self.create_searchwithin_selection_box('f1', ln=ln),
middle_option_2 = self.create_searchwithin_selection_box('f2', ln=ln),
middle_option_3 = self.create_searchwithin_selection_box('f3', ln=ln),
searchoptions = self.create_searchoptions(),
sortoptions = self.create_sortoptions(ln),
rankoptions = self.create_rankoptions(ln),
displayoptions = self.create_displayoptions(ln),
formatoptions = self.create_formatoptions(ln)
)
def calculate_reclist(self):
"""Calculate, set and return the (reclist, reclist_with_nonpublic_subcolls) tuple for given collection."""
if self.calculate_reclist_run_already or str(self.dbquery).startswith("hostedcollection:"):
# do we have to recalculate?
return (self.reclist, self.reclist_with_nonpublic_subcolls)
write_message("... calculating reclist of %s" % self.name, verbose=6)
reclist = intbitset() # will hold results for public sons only; good for storing into DB
reclist_with_nonpublic_subcolls = intbitset() # will hold results for both public and nonpublic sons; good for deducing total
# number of documents
if not self.dbquery:
# A - collection does not have dbquery, so query recursively all its sons
# that are either non-restricted or that have the same restriction rules
for coll in self.get_sons():
coll_reclist, coll_reclist_with_nonpublic_subcolls = coll.calculate_reclist()
if ((coll.restricted_p() is None) or
(coll.restricted_p() == self.restricted_p())):
# add this reclist ``for real'' only if it is public
reclist.union_update(coll_reclist)
reclist_with_nonpublic_subcolls.union_update(coll_reclist_with_nonpublic_subcolls)
else:
# B - collection does have dbquery, so compute it:
# (note: explicitly remove DELETED records)
if CFG_CERN_SITE:
reclist = search_pattern_parenthesised(None, self.dbquery + \
- ' -980__:"DELETED" -980__:"DUMMY"', ap=9) #ap=9 for allow queries containing hidden tags
+ ' -980__:"DELETED" -980__:"DUMMY"', ap=-9) #ap=-9 for allow queries containing hidden tags
else:
- reclist = search_pattern_parenthesised(None, self.dbquery + ' -980__:"DELETED"', ap=9) #ap=9 allow queries containing hidden tags
+ reclist = search_pattern_parenthesised(None, self.dbquery + ' -980__:"DELETED"', ap=-9) #ap=-9 allow queries containing hidden tags
reclist_with_nonpublic_subcolls = copy.deepcopy(reclist)
# store the results:
self.nbrecs = len(reclist_with_nonpublic_subcolls)
self.reclist = reclist
self.reclist_with_nonpublic_subcolls = reclist_with_nonpublic_subcolls
# last but not least, update the speed-up flag:
self.calculate_reclist_run_already = 1
# return the two sets:
return (self.reclist, self.reclist_with_nonpublic_subcolls)
def calculate_nbrecs_for_external_collection(self, timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT):
"""Calculate the total number of records, aka nbrecs, for given external collection."""
#if self.calculate_reclist_run_already:
# do we have to recalculate?
#return self.nbrecs
#write_message("... calculating nbrecs of external collection %s" % self.name, verbose=6)
if external_collections_dictionary.has_key(self.name):
engine = external_collections_dictionary[self.name]
if engine.parser:
self.nbrecs_tmp = engine.parser.parse_nbrecs(timeout)
if self.nbrecs_tmp >= 0: return self.nbrecs_tmp
# the parse_nbrecs() function returns negative values for some specific cases
# maybe we can handle these specific cases, some warnings or something
# for now the total number of records remains silently the same
else: return self.nbrecs
else: write_message("External collection %s does not have a parser!" % self.name, verbose=6)
else: write_message("External collection %s not found!" % self.name, verbose=6)
return 0
# last but not least, update the speed-up flag:
#self.calculate_reclist_run_already = 1
def check_nbrecs_for_external_collection(self):
"""Check if the external collections has changed its total number of records, aka nbrecs.
Rerurns True if the total number of records has changed and False if it's the same"""
write_message("*** self.nbrecs = %s / self.cal...ion = %s ***" % (str(self.nbrecs), str(self.calculate_nbrecs_for_external_collection())), verbose=6)
write_message("*** self.nbrecs != self.cal...ion = %s ***" % (str(self.nbrecs != self.calculate_nbrecs_for_external_collection()),), verbose=6)
return self.nbrecs != self.calculate_nbrecs_for_external_collection(CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS)
def set_nbrecs_for_external_collection(self):
"""Set this external collection's total number of records, aka nbrecs"""
if self.calculate_reclist_run_already:
# do we have to recalculate?
return
write_message("... calculating nbrecs of external collection %s" % self.name, verbose=6)
if self.nbrecs_tmp:
self.nbrecs = self.nbrecs_tmp
else:
self.nbrecs = self.calculate_nbrecs_for_external_collection(CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS)
# last but not least, update the speed-up flag:
self.calculate_reclist_run_already = 1
def update_reclist(self):
"Update the record universe for given collection; nbrecs, reclist of the collection table."
if self.update_reclist_run_already:
# do we have to reupdate?
return 0
write_message("... updating reclist of %s (%s recs)" % (self.name, self.nbrecs), verbose=6)
sys.stdout.flush()
try:
run_sql("UPDATE collection SET nbrecs=%s, reclist=%s WHERE id=%s",
(self.nbrecs, self.reclist.fastdump(), self.id))
self.reclist_updated_since_start = 1
except Error, e:
print "Database Query Error %d: %s." % (e.args[0], e.args[1])
sys.exit(1)
# last but not least, update the speed-up flag:
self.update_reclist_run_already = 1
return 0
def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"):
"""Returns a date string according to the format string.
It can handle normal date strings and shifts with respect
to now."""
date = time.time()
shift_re = re.compile("([-\+]{0,1})([\d]+)([dhms])")
factors = {"d":24*3600, "h":3600, "m":60, "s":1}
m = shift_re.match(var)
if m:
sign = m.groups()[0] == "-" and -1 or 1
factor = factors[m.groups()[2]]
value = float(m.groups()[1])
date = time.localtime(date + sign * factor * value)
date = time.strftime(format_string, date)
else:
date = time.strptime(var, format_string)
date = time.strftime(format_string, date)
return date
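# Examples of the shift syntax handled above (illustrative calls): "+1d"
# means one day from now, "-30m" half an hour ago; anything that does not
# match the shift pattern is parsed as a literal date in FORMAT_STRING:
#
#   get_datetime("+1d")                  # tomorrow, "%Y-%m-%d %H:%M:%S"
#   get_datetime("-30m")                 # half an hour ago
#   get_datetime("2012-01-01 00:00:00")  # passed through unchanged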
def get_current_time_timestamp():
"""Return timestamp corresponding to the current time."""
return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
def compare_timestamps_with_tolerance(timestamp1,
timestamp2,
tolerance=0):
"""Compare two timestamps TIMESTAMP1 and TIMESTAMP2, of the form
'2005-03-31 17:37:26'. Optionally receives a TOLERANCE argument
(in seconds). Return -1 if TIMESTAMP1 is less than TIMESTAMP2
minus TOLERANCE, 0 if they are equal within TOLERANCE limit,
and 1 if TIMESTAMP1 is greater than TIMESTAMP2 plus TOLERANCE.
"""
# remove any trailing .00 in timestamps:
timestamp1 = re.sub(r'\.[0-9]+$', '', timestamp1)
timestamp2 = re.sub(r'\.[0-9]+$', '', timestamp2)
# first convert timestamps to Unix epoch seconds:
timestamp1_seconds = calendar.timegm(time.strptime(timestamp1, "%Y-%m-%d %H:%M:%S"))
timestamp2_seconds = calendar.timegm(time.strptime(timestamp2, "%Y-%m-%d %H:%M:%S"))
# now compare them:
if timestamp1_seconds < timestamp2_seconds - tolerance:
return -1
elif timestamp1_seconds > timestamp2_seconds + tolerance:
return 1
else:
return 0
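# Illustration (hypothetical timestamps): with a 20-second tolerance the
# two stamps below differ by only 10 seconds, so they compare as equal:
#
#   >>> compare_timestamps_with_tolerance('2005-03-31 17:37:26',
#   ...                                   '2005-03-31 17:37:36', tolerance=20)
#   0
#   >>> compare_timestamps_with_tolerance('2005-03-31 17:37:26',
#   ...                                   '2005-03-31 17:38:26')
#   -1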
def get_database_last_updated_timestamp():
"""Return last updated timestamp for collection-related and
record-related database tables.
"""
database_tables_timestamps = []
database_tables_timestamps.append(get_table_update_time('bibrec'))
database_tables_timestamps.append(get_table_update_time('bibfmt'))
try:
database_tables_timestamps.append(get_table_update_time('idxWORD%'))
except ValueError:
# There are no indexes in the database. That's OK.
pass
database_tables_timestamps.append(get_table_update_time('collection%'))
database_tables_timestamps.append(get_table_update_time('portalbox'))
database_tables_timestamps.append(get_table_update_time('field%'))
database_tables_timestamps.append(get_table_update_time('format%'))
database_tables_timestamps.append(get_table_update_time('rnkMETHODNAME'))
database_tables_timestamps.append(get_table_update_time('accROLE_accACTION_accARGUMENT', run_on_slave=True))
return max(database_tables_timestamps)
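# Note: taking max() of the timestamps as plain strings is safe here, since
# get_table_update_time() returns them in the fixed '%Y-%m-%d %H:%M:%S'
# format, which sorts lexicographically in chronological order, e.g.:
#   max(['2012-05-31 23:59:59', '2012-06-01 00:00:00']) -> '2012-06-01 00:00:00'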
def get_cache_last_updated_timestamp():
"""Return last updated cache timestamp."""
    try:
        f = open(CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE, "r")
    except IOError:
        # the file does not exist (yet) or is unreadable; pretend the cache
        # was never updated:
        return "1970-01-01 00:00:00"
    timestamp = f.read()
    f.close()
    return timestamp
def set_cache_last_updated_timestamp(timestamp):
"""Set last updated cache timestamp to TIMESTAMP."""
    try:
        f = open(CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE, "w")
    except IOError:
        # cannot write the timestamp file; give up silently, but do not fall
        # through to the undefined file handle below:
        return None
    f.write(timestamp)
    f.close()
    return timestamp
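# Illustrative round-trip (writes CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE):
#   set_cache_last_updated_timestamp('2012-06-01 10:00:00')
#   get_cache_last_updated_timestamp()  -> '2012-06-01 10:00:00'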
def main():
"""Main that construct all the bibtask."""
task_init(authorization_action="runwebcoll",
authorization_msg="WebColl Task Submission",
description="""Description:
webcoll updates the collection cache (record universe for a
given collection plus web page elements) based on invenio.conf and DB
configuration parameters. If the collection name is passed as an argument,
only this collection's cache will be updated. If the recursive option is
set as well, the collection's descendants will also be updated.\n""",
help_specific_usage=" -c, --collection\t Update cache for the given "
"collection only. [all]\n"
" -r, --recursive\t Update cache for the given collection and all its\n"
"\t\t\t descendants (to be used in combination with -c). [no]\n"
" -f, --force\t\t Force update even if cache is up to date. [no]\n"
" -p, --part\t\t Update only certain cache parts (1=reclist,"
" 2=webpage). [both]\n"
" -l, --language\t Update pages in only certain language"
" (e.g. fr,it,...). [all]\n",
version=__revision__,
specific_params=("c:rfp:l:", [
"collection=",
"recursive",
"force",
"part=",
"language="
]),
task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter,
task_submit_check_options_fnc=task_submit_check_options,
task_run_fnc=task_run_core)
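# Typical invocations, assuming the usual command-line daemon client named
# 'webcoll' wraps this task (names and paths may differ per installation):
#   $ webcoll                      # update caches of all collections, if stale
#   $ webcoll -f -c Theses -r      # force-update 'Theses' and its descendants
#   $ webcoll -p 2 -l en,fr        # refresh webpage cache only, EN/FR pages only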
def task_submit_elaborate_specific_parameter(key, value, opts, args):
""" Given the string key it checks it's meaning, eventually using the value.
Usually it fills some key in the options dict.
It must return True if it has elaborated the key, False, if it doesn't
know that key.
eg:
if key in ['-n', '--number']:
self.options['number'] = value
return True
return False
"""
if key in ("-c", "--collection"):
task_set_option("collection", value)
elif key in ("-r", "--recursive"):
task_set_option("recursive", 1)
elif key in ("-f", "--force"):
task_set_option("force", 1)
elif key in ("-p", "--part"):
task_set_option("part", int(value))
elif key in ("-l", "--language"):
languages = task_get_option("language", [])
languages += value.split(',')
for ln in languages:
if ln not in CFG_SITE_LANGS:
print 'ERROR: "%s" is not a recognized language code' % ln
return False
task_set_option("language", languages)
else:
return False
return True
def task_submit_check_options():
if task_has_option('collection'):
coll = get_collection(task_get_option("collection"))
if coll.id is None:
print 'ERROR: Collection "%s" does not exist' % coll.name
return False
return True
def task_run_core():
""" Reimplement to add the body of the task."""
##
## ------->--->time--->------>
## (-1) | ( 0) | ( 1)
## | | |
## [T.db] | [T.fc] | [T.db]
## | | |
## |<-tol|tol->|
##
    ## the above is the compare_timestamps_with_tolerance result "diagram"
    ## [T.db] stands for the database timestamp and [T.fc] for the file cache timestamp
    ## (-1, 0, 1) stand for the returned values
    ## tol stands for the tolerance in seconds
    ##
    ## When a record has been added to or deleted from one of the collections, T.db becomes greater than T.fc,
    ## and the next webcoll run is a full run: it recalculates the reclists and nbrecs, and since it updates
    ## the collection db table it also bumps T.db. T.fc is set to the moment the task started running, thus
    ## slightly before T.db (the gap is roughly the time between the start of the task and the last call of
    ## update_reclist). Therefore the following webcoll run is again a full run, even if no database changes
    ## have happened in the meanwhile (compare_timestamps_with_tolerance returns 0). This time though, provided
    ## no database changes have taken place, T.db stays the same while T.fc is updated, so a third run will
    ## not be a full run (a concrete walk-through follows right below).
##
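    ## A concrete, purely hypothetical walk-through (with tol = 3s):
    ##   run 1: T.db = 10:00:05, T.fc = 09:00:00  -> returns  1 -> full run; afterwards T.fc = 10:30:00, T.db = 10:30:02
    ##   run 2: T.db = 10:30:02, T.fc = 10:30:00  -> returns  0 -> full run; afterwards T.fc = 11:00:00, T.db unchanged
    ##   run 3: T.db = 10:30:02, T.fc = 11:00:00  -> returns -1 -> cache is up to date, nothing to do
    ##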
task_run_start_timestamp = get_current_time_timestamp()
colls = []
# decide whether we need to run or not, by comparing last updated timestamps:
write_message("Database timestamp is %s." % get_database_last_updated_timestamp(), verbose=3)
write_message("Collection cache timestamp is %s." % get_cache_last_updated_timestamp(), verbose=3)
if task_has_option("part"):
write_message("Running cache update part %s only." % task_get_option("part"), verbose=3)
if check_nbrecs_for_all_external_collections() or task_has_option("force") or \
compare_timestamps_with_tolerance(get_database_last_updated_timestamp(),
get_cache_last_updated_timestamp(),
CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE) >= 0:
## either forced update was requested or cache is not up to date, so recreate it:
# firstly, decide which collections to do:
if task_has_option("collection"):
coll = get_collection(task_get_option("collection"))
colls.append(coll)
if task_has_option("recursive"):
r_type_descendants = coll.get_descendants(type='r')
colls += r_type_descendants
v_type_descendants = coll.get_descendants(type='v')
colls += v_type_descendants
else:
res = run_sql("SELECT name FROM collection ORDER BY id")
for row in res:
colls.append(get_collection(row[0]))
# secondly, update collection reclist cache:
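        # note: the defaults passed to task_get_option() below are chosen so
        # that when no --part option was given, *both* parts run (each default
        # equals the part number that its branch tests for)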
        if task_get_option("part", 1) == 1:
i = 0
for coll in colls:
i += 1
write_message("%s / reclist cache update" % coll.name)
if str(coll.dbquery).startswith("hostedcollection:"):
coll.set_nbrecs_for_external_collection()
else:
coll.calculate_reclist()
task_sleep_now_if_required()
coll.update_reclist()
task_update_progress("Part 1/2: done %d/%d" % (i, len(colls)))
task_sleep_now_if_required(can_stop_too=True)
# thirdly, update collection webpage cache:
if task_get_option("part", 2) == 2:
i = 0
for coll in colls:
i += 1
write_message("%s / webpage cache update" % coll.name)
coll.update_webpage_cache()
task_update_progress("Part 2/2: done %d/%d" % (i, len(colls)))
task_sleep_now_if_required(can_stop_too=True)
# finally update the cache last updated timestamp:
# (but only when all collections were updated, not when only
# some of them were forced-updated as per admin's demand)
if not task_has_option("collection"):
set_cache_last_updated_timestamp(task_run_start_timestamp)
write_message("Collection cache timestamp is set to %s." % get_cache_last_updated_timestamp(), verbose=3)
else:
## cache up to date, we don't have to run
write_message("Collection cache is up to date, no need to run.")
## we are done:
return True
### okay, here we go:
if __name__ == '__main__':
main()