diff --git a/config/invenio.conf b/config/invenio.conf
index 0e0350975..0e2a3fd57 100644
--- a/config/invenio.conf
+++ b/config/invenio.conf
@@ -1,1005 +1,1013 @@
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
###################################################
## About 'invenio.conf' and 'invenio-local.conf' ##
###################################################
## The 'invenio.conf' file contains the vanilla default configuration
## parameters of a CDS Invenio installation, as coming from the
## distribution. The file should be self-explanatory. Once installed
## in its usual location (usually /opt/cds-invenio/etc), you could in
## principle go ahead and change the values according to your local
## needs.
##
## However, you can also create a file named 'invenio-local.conf' in
## the same directory where 'invenio.conf' lives and put there only
## the localizations you need to have different from the default ones.
## For example:
##
## $ cat /opt/cds-invenio/etc/invenio-local.conf
## [Invenio]
## CFG_SITE_URL = http://your.site.com
## CFG_SITE_SECURE_URL = https://your.site.com
## CFG_SITE_ADMIN_EMAIL = john.doe@your.site.com
## CFG_SITE_SUPPORT_EMAIL = john.doe@your.site.com
##
## The Invenio system will then read both the default invenio.conf
## file and your customized invenio-local.conf file and it will
## override any default options with the ones you have set in your
## local file. This cascading of configuration parameters will ease
## your future upgrades.
[Invenio]
###################################
## Part 1: Essential parameters ##
###################################
## This part defines essential CDS Invenio internal parameters that
## everybody should override, like the name of the server or the email
## address of the local CDS Invenio administrator.
## CFG_DATABASE_* - specify which MySQL server to use, the name of the
## database to use, and the database access credentials.
CFG_DATABASE_HOST = localhost
CFG_DATABASE_PORT = 3306
CFG_DATABASE_NAME = cdsinvenio
CFG_DATABASE_USER = cdsinvenio
CFG_DATABASE_PASS = my123p$ss
## CFG_SITE_URL - specify URL under which your installation will be
## visible. For example, use "http://your.site.com". Do not leave
## trailing slash.
CFG_SITE_URL = http://localhost
## CFG_SITE_SECURE_URL - specify secure URL under which your
## installation secure pages such as login or registration will be
## visible. For example, use "https://your.site.com". Do not leave
## trailing slash. If you don't plan on using HTTPS, then you may
## leave this empty.
CFG_SITE_SECURE_URL = https://localhost
## CFG_SITE_NAME -- the visible name of your CDS Invenio installation.
CFG_SITE_NAME = Atlantis Institute of Fictive Science
## CFG_SITE_NAME_INTL -- the international versions of CFG_SITE_NAME
## in various languages. (See also CFG_SITE_LANGS below.)
CFG_SITE_NAME_INTL_en = Atlantis Institute of Fictive Science
CFG_SITE_NAME_INTL_fr = Atlantis Institut des Sciences Fictives
CFG_SITE_NAME_INTL_de = Atlantis Institut der fiktiven Wissenschaft
CFG_SITE_NAME_INTL_es = Atlantis Instituto de la Ciencia Fictive
CFG_SITE_NAME_INTL_ca = Institut Atlantis de Ciència Fictícia
CFG_SITE_NAME_INTL_pt = Instituto Atlantis de Ciência Fictícia
CFG_SITE_NAME_INTL_it = Atlantis Istituto di Scienza Fittizia
CFG_SITE_NAME_INTL_ru = Атлантис Институт фиктивных Наук
CFG_SITE_NAME_INTL_sk = Atlantis Inštitút Fiktívnych Vied
CFG_SITE_NAME_INTL_cs = Atlantis Institut Fiktivních Věd
CFG_SITE_NAME_INTL_no = Atlantis Institutt for Fiktiv Vitenskap
CFG_SITE_NAME_INTL_sv = Atlantis Institut för Fiktiv Vetenskap
CFG_SITE_NAME_INTL_el = Ινστιτούτο Φανταστικών Επιστημών Ατλαντίδος
CFG_SITE_NAME_INTL_uk = Інститут вигаданих наук в Атлантісі
CFG_SITE_NAME_INTL_ja = Fictive 科学のAtlantis の協会
CFG_SITE_NAME_INTL_pl = Instytut Fikcyjnej Nauki Atlantis
CFG_SITE_NAME_INTL_bg = Институт за фиктивни науки Атлантис
CFG_SITE_NAME_INTL_hr = Institut Fiktivnih Znanosti Atlantis
CFG_SITE_NAME_INTL_zh_CN = 阿特兰提斯虚拟科学学院
CFG_SITE_NAME_INTL_zh_TW = 阿特蘭提斯虛擬科學學院
CFG_SITE_NAME_INTL_hu = Kitalált Tudományok Atlantiszi Intézete
CFG_SITE_NAME_INTL_af = Atlantis Instituut van Fiktiewe Wetenskap
CFG_SITE_NAME_INTL_gl = Instituto Atlantis de Ciencia Fictive
CFG_SITE_NAME_INTL_ro = Institutul Atlantis al Ştiinţelor Fictive
CFG_SITE_NAME_INTL_rw = Atlantis Ishuri Rikuru Ry'ubuhanga
## CFG_SITE_LANG -- the default language of the interface:
CFG_SITE_LANG = en
## CFG_SITE_LANGS -- list of all languages the user interface should
## be available in, separated by commas. The order specified below
## will be respected on the interface pages. A good default would be
## to use the alphabetical order. Currently supported languages
## include Afrikaans, Bulgarian, Catalan, Czech, German, Greek,
## English, Spanish, French, Croatian, Hungarian, Galician, Italian,
## Japanese, Kinyarwanda, Norwegian, Polish, Portuguese, Romanian,
## Russian, Slovak, Swedish, Ukrainian, Chinese (China), Chinese
## (Taiwan), so that the eventual maximum you can currently select is
## "af,bg,ca,cs,de,el,en,es,fr,hr,gl,it,rw,hu,ja,no,pl,pt,ro,ru,sk,sv,uk,zh_CN,zh_TW".
CFG_SITE_LANGS = af,bg,ca,cs,de,el,en,es,fr,hr,gl,it,rw,hu,ja,no,pl,pt,ro,ru,sk,sv,uk,zh_CN,zh_TW
## CFG_SITE_SUPPORT_EMAIL -- the email address of the support team for
## this installation:
CFG_SITE_SUPPORT_EMAIL = cds.support@cern.ch
## CFG_SITE_ADMIN_EMAIL -- the email address of the 'superuser' for
## this installation. Enter your email address below and login with
## this address when using CDS Invenio administration modules. You
## will then be automatically recognized as superuser of the system.
CFG_SITE_ADMIN_EMAIL = cds.support@cern.ch
## CFG_SITE_EMERGENCY_PHONE_NUMBERS -- list of mobile phone numbers to
## which an sms should be sent in case of emergency (e.g. bibsched queue
## has been stopped because of an error).
## Note that in order to use this function, if CFG_CERN_SITE is set to 0,
## the function send_sms in errorlib should be reimplemented.
CFG_SITE_EMERGENCY_PHONE_NUMBERS =
## CFG_CERN_SITE -- do we want to enable CERN-specific code?
## Put "1" for "yes" and "0" for "no".
CFG_CERN_SITE = 0
## CFG_INSPIRE_SITE -- do we want to enable INSPIRE-specific code?
## Put "1" for "yes" and "0" for "no".
CFG_INSPIRE_SITE = 0
## CFG_ADS_SITE -- do we want to enable ADS-specific code?
## Put "1" for "yes" and "0" for "no".
CFG_ADS_SITE = 0
## CFG_DEVEL_SITE -- is this a development site? If it is, you might
## prefer that it doesn't do certain things. For example, you might
## not want WebSubmit to send certain emails or trigger certain
## processes on a development site.
## Put "1" for "yes" (this is a development site) or "0" for "no"
## (this isn't a development site.)
CFG_DEVEL_SITE = 0
################################
## Part 2: Web page style ##
################################
## The variables affecting the page style. The most important one is
## the 'template skin' you would like to use and the obfuscation mode
## for your email addresses. Please refer to the WebStyle Admin Guide
## for more explanation. The other variables are listed here mostly
## for backwards compatibility purposes only.
## CFG_WEBSTYLE_TEMPLATE_SKIN -- what template skin do you want to
## use?
CFG_WEBSTYLE_TEMPLATE_SKIN = default
## CFG_WEBSTYLE_EMAIL_ADDRESSES_OBFUSCATION_MODE. How do we "protect"
## email addresses from undesired automated email harvesters? This
## setting will not affect 'support' and 'admin' emails.
## NOTE: there is no ultimate solution to protect against email
## harvesting. All have drawbacks and can more or less be
## circumvented. Choose your preferred mode ([t] means "transparent"
## for the user):
## -1: hide all emails.
## [t] 0 : no protection, email returned as is.
## foo@example.com => foo@example.com
## 1 : basic email munging: replaces @ by [at] and . by [dot]
## foo@example.com => foo [at] example [dot] com
## [t] 2 : transparent name mangling: characters are replaced by
## equivalent HTML entities.
## foo@example.com => &#102;&#111;&#111;&#64;&#101;&#120;&#97;&#109;&#112;&#108;&#101;&#46;&#99;&#111;&#109;
## [t] 3 : javascript insertion. Requires Javascript enabled on client
## side.
## 4 : replaces @ and . characters by gif equivalents.
## foo@example.com => foo<img src="at.gif" alt=" [at] ">example<img src="dot.gif" alt=" [dot] ">com
CFG_WEBSTYLE_EMAIL_ADDRESSES_OBFUSCATION_MODE = 2
## CFG_WEBSTYLE_INSPECT_TEMPLATES -- Do we want to debug all template
## functions so that they would return HTML results wrapped in
## comments indicating which part of HTML page was created by which
## template function? Useful only for debugging Pythonic HTML
## template. See WebStyle Admin Guide for more information.
CFG_WEBSTYLE_INSPECT_TEMPLATES = 0
## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXLEFTTOP -- eventual global HTML
## left top box:
CFG_WEBSTYLE_CDSPAGEBOXLEFTTOP =
## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXLEFTBOTTOM -- eventual global
## HTML left bottom box:
CFG_WEBSTYLE_CDSPAGEBOXLEFTBOTTOM =
## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXRIGHTTOP -- eventual global
## HTML right top box:
CFG_WEBSTYLE_CDSPAGEBOXRIGHTTOP =
## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXRIGHTBOTTOM -- eventual global
## HTML right bottom box:
CFG_WEBSTYLE_CDSPAGEBOXRIGHTBOTTOM =
## CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST -- when certain HTTP status
## codes are raised to the WSGI handler, the corresponding exceptions
## and error messages can be sent to the system administrator for
## inspecting. This is useful to detect and correct errors. The
## variable represents a comma-separated list of HTTP statuses that
## should alert admin. Wildcards are possible. If the status is
## followed by an "r", it means that a referer is required to exist
## (useful to distinguish broken known links from URL typos when 404
## errors are raised).
CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST = 404r,400,5*,41*
##################################
## Part 3: WebSearch parameters ##
##################################
## This section contains some configuration parameters for WebSearch
## module. Please note that WebSearch is mostly configured on
## run-time via its WebSearch Admin web interface. The parameters
## below are the ones that you do not probably want to modify very
## often during the runtime. (Note that you may modify them
## afterwards too, though.)
## CFG_WEBSEARCH_SEARCH_CACHE_SIZE -- how many queries we want to
## cache in memory per one Apache httpd process? This cache is used
## mainly for "next/previous page" functionality, but it caches also
## "popular" user queries if more than one user happen to search for
## the same thing. Note that large numbers may lead to great memory
## consumption. We recommend a value not greater than 100.
CFG_WEBSEARCH_SEARCH_CACHE_SIZE = 100
## CFG_WEBSEARCH_FIELDS_CONVERT -- if you migrate from an older
## system, you may want to map field codes of your old system (such as
## 'ti') to CDS Invenio/MySQL ("title"). Use Python dictionary syntax
## for the translation table, e.g. {'wau':'author', 'wti':'title'}.
## Usually you don't want to do that, and you would use empty dict {}.
CFG_WEBSEARCH_FIELDS_CONVERT = {}
## CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH -- width of the
## search pattern window in the light search interface, in
## characters.
CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH = 60
## CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH -- width of the search
## pattern window in the simple search interface, in characters.
CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH = 40
## CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH -- width of the
## search pattern window in the advanced search interface, in
## characters.
CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH = 30
## CFG_WEBSEARCH_NB_RECORDS_TO_SORT -- how many records do we still
## want to sort? For higher numbers we print only a warning and won't
## perform any sorting other than default 'latest records first', as
## sorting would be very time consuming then. We recommend a value of
## not more than a couple of thousands.
CFG_WEBSEARCH_NB_RECORDS_TO_SORT = 1000
## CFG_WEBSEARCH_CALL_BIBFORMAT -- if a record is being displayed but
## it was not preformatted in the "HTML brief" format, do we want to
## call BibFormatting on the fly? Put "1" for "yes" and "0" for "no".
## Note that "1" will display the record exactly as if it were fully
## preformatted, but it may be slow due to on-the-fly processing; "0"
## will display a default format very fast, but it may not have all
## the fields as in the fully preformatted HTML brief format. Note
## also that this option is active only for old (PHP) formats; the new
## (Python) formats are called on the fly by default anyway, since
## they are much faster. When unsure, please set "0" here.
CFG_WEBSEARCH_CALL_BIBFORMAT = 0
## CFG_WEBSEARCH_USE_ALEPH_SYSNOS -- do we want to make old SYSNOs
## visible rather than MySQL's record IDs? You may use this if you
## migrate from a different e-doc system, and you store your old
## system numbers into 970__a. Put "1" for "yes" and "0" for
## "no". Usually you don't want to do that, though.
CFG_WEBSEARCH_USE_ALEPH_SYSNOS = 0
## CFG_WEBSEARCH_I18N_LATEST_ADDITIONS -- Put "1" if you want the
## "Latest Additions" in the web collection pages to show
## internationalized records. Useful only if your brief BibFormat
## templates contains internationalized strings. Otherwise put "0" in
## order not to slow down the creation of latest additions by WebColl.
CFG_WEBSEARCH_I18N_LATEST_ADDITIONS = 0
## CFG_WEBSEARCH_INSTANT_BROWSE -- the number of records to display
## under 'Latest Additions' in the web collection pages.
CFG_WEBSEARCH_INSTANT_BROWSE = 10
## CFG_WEBSEARCH_INSTANT_BROWSE_RSS -- the number of records to
## display in the RSS feed.
CFG_WEBSEARCH_INSTANT_BROWSE_RSS = 25
## CFG_WEBSEARCH_RSS_I18N_COLLECTIONS -- comma-separated list of
## collections that feature an internationalized RSS feed on their
## main search interface page created by webcoll. Other collections
## will have RSS feed using CFG_SITE_LANG.
CFG_WEBSEARCH_RSS_I18N_COLLECTIONS =
## CFG_WEBSEARCH_RSS_TTL -- number of minutes that indicates how long
## a feed cache is valid.
CFG_WEBSEARCH_RSS_TTL = 360
## CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS -- maximum number of requests kept
## in cache. If the cache is filled, following requests are not cached.
CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS = 1000
## CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD -- up to how many author names
## to print explicitly; for more print "et al". Note that this is
## used in default formatting that is seldom used, as usually
## BibFormat defines all the format. The value below is only used
## when BibFormat fails, for example.
CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD = 3
## CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS -- whether to show or
## not collection grandsons in Narrow Search boxes (sons are shown by
## default, grandsons are configurable here). Use 0 for no and 1 for
## yes.
CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS = 1
## CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX -- shall we
## create help links for Ellis, Nick or Ellis, Nicholas and friends
## when Ellis, N was searched for? Useful if you have one author
## stored in the database under several name formats, namely surname
## comma firstname and surname comma initial cataloging policy. Use 0
## for no and 1 for yes.
CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX = 1
## CFG_WEBSEARCH_USE_JSMATH_FOR_FORMATS -- jsMath is a JavaScript
## library that renders (La)TeX mathematical formulas in the client
## browser. This parameter must contain a comma-separated list of
## output formats for which to apply the jsMath rendering, for example
## "hb,hd". If the list is empty, jsMath is disabled.
CFG_WEBSEARCH_USE_JSMATH_FOR_FORMATS =
## CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_TIMEOUT -- when searching
## external collections (e.g. SPIRES, CiteSeer, etc), how many seconds
## do we wait for reply before abandoning?
CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_TIMEOUT = 5
## CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_MAXRESULTS -- how many
## results do we fetch?
CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_MAXRESULTS = 10
## CFG_WEBSEARCH_SPLIT_BY_COLLECTION -- do we want to split the search
## results by collection or not? Use 0 for not, 1 for yes.
CFG_WEBSEARCH_SPLIT_BY_COLLECTION = 1
## CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS -- in order to limit denial of
## service attacks the total number of records per group displayed as a
## result of a search query will be limited to this number. Only the superuser
## queries will not be affected by this limit.
CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS = 200
## CFG_WEBSEARCH_PERMITTED_RESTRICTED_COLLECTIONS_LEVEL -- logged in users
## might have rights to access some restricted collections. This variable
## tweaks the kind of support the system will automatically provide to the
## user with respect to searching into these restricted collections.
## Set this to 0 in order to have the user to explicitly activate restricted
## collections in order to search into them. Set this to 1 in order to
## propose to the user the list of restricted collections to which he/she has
## rights (note: this is not yet implemented). Set this to 2 in order to
## silently add all the restricted collections to which the user has rights
## to any query.
## Note: the system will discover which restricted collections a user has
## rights to, at login time. The time complexity of this procedure is
## proportional to the number of restricted collections. E.g. for a system
## with ~50 restricted collections, you might expect ~1s of delay in the
## login time, when this variable is set to a value higher than 0.
CFG_WEBSEARCH_PERMITTED_RESTRICTED_COLLECTIONS_LEVEL = 0
## CFG_WEBSEARCH_SHOW_COMMENT_COUNT -- do we want to show the 'N comments'
## links on the search engine pages? (useful only when you have allowed
## commenting)
CFG_WEBSEARCH_SHOW_COMMENT_COUNT = 1
## CFG_WEBSEARCH_SHOW_REVIEW_COUNT -- do we want to show the 'N reviews'
## links on the search engine pages? (useful only when you have allowed
## reviewing)
CFG_WEBSEARCH_SHOW_REVIEW_COUNT = 1
#######################################
## Part 4: BibHarvest OAI parameters ##
#######################################
## This part defines parameters for the CDS Invenio OAI gateway.
## Useful if you are running CDS Invenio as OAI data provider.
## CFG_OAI_ID_FIELD -- OAI identifier MARC field:
CFG_OAI_ID_FIELD = 909COo
## CFG_OAI_SET_FIELD -- OAI set MARC field:
CFG_OAI_SET_FIELD = 909COp
## CFG_OAI_DELETED_POLICY -- OAI deletedrecordspolicy
## (no/transient/persistent).
CFG_OAI_DELETED_POLICY = no
## CFG_OAI_ID_PREFIX -- OAI identifier prefix:
CFG_OAI_ID_PREFIX = atlantis.cern.ch
## CFG_OAI_SAMPLE_IDENTIFIER -- OAI sample identifier:
CFG_OAI_SAMPLE_IDENTIFIER = oai:atlantis.cern.ch:CERN-TH-4036
## CFG_OAI_IDENTIFY_DESCRIPTION -- description for the OAI Identify verb:
CFG_OAI_IDENTIFY_DESCRIPTION = oaiatlantis.cern.ch:oai:atlantis.cern.ch:CERN-TH-4036http://atlantis.cern.ch/Free and unlimited use by anybody with obligation to refer to original recordFull content, i.e. preprints may not be harvested by robotsSubmission restricted. Submitted documents are subject of approval by OAI repository admins.
## CFG_OAI_LOAD -- OAI number of records in a response:
CFG_OAI_LOAD = 1000
## CFG_OAI_EXPIRE -- OAI resumptionToken expiration time:
CFG_OAI_EXPIRE = 90000
## CFG_OAI_SLEEP -- service unavailable between two consecutive
## requests for CFG_OAI_SLEEP seconds:
CFG_OAI_SLEEP = 10
##################################
## Part 5: WebSubmit parameters ##
##################################
## This section contains some configuration parameters for WebSubmit
## module. Please note that WebSubmit is mostly configured on
## run-time via its WebSubmit Admin web interface. The parameters
## below are the ones that you do not probably want to modify during
## the runtime.
## CFG_WEBSUBMIT_FILESYSTEM_BIBDOC_GROUP_LIMIT -- the fulltext
## documents are stored under "/opt/cds-invenio/var/data/files/gX/Y"
## directories where X is 0,1,... and Y stands for bibdoc ID. Thusly
## documents Y are grouped into directories X and this variable
## indicates the maximum number of documents Y stored in each
## directory X. This limit is imposed solely for filesystem
## performance reasons in order not to have too many subdirectories in
## a given directory.
CFG_WEBSUBMIT_FILESYSTEM_BIBDOC_GROUP_LIMIT = 5000
## CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS -- a comma-separated
## list of document extensions not listed in Python standard mimetype
## library that should be recognized by Invenio.
CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS = hpg,link,lis,llb,mat,mpp,msg,docx,docm,xlsx,xlsm,xlsb,pptx,pptm,ppsx,ppsm
## CFG_BIBDOCFILE_USE_XSENDFILE -- if your web server supports
## XSendfile header, you may want to enable this feature in order for
## Invenio to tell the web server to stream files for download (after
## proper authorization checks) by web server's means. This helps to
## liberate Invenio worker processes from being busy with sending big
## files to clients. The web server will take care of that. Note:
## this feature is still somewhat experimental. Note: when enabled
## (set to 1), then you have to also regenerate Apache vhost conf
## snippets (inveniocfg --update-config-py --create-apache-conf).
CFG_BIBDOCFILE_USE_XSENDFILE = 0
## CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY -- a number between 0 and
## 1 that indicates probability with which MD5 checksum will be
## verified when streaming bibdocfile-managed files. (0.1 will cause
## the check to be performed once for every 10 downloads)
CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY = 0.1
#################################
## Part 6: BibIndex parameters ##
#################################
## This section contains some configuration parameters for BibIndex
## module. Please note that BibIndex is mostly configured on run-time
## via its BibIndex Admin web interface. The parameters below are the
## ones that you do not probably want to modify very often during the
## runtime.
## CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY -- when fulltext indexing, do
## you want to index locally stored files only, or also external URLs?
## Use "0" to say "no" and "1" to say "yes".
CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY = 0
## CFG_BIBINDEX_REMOVE_STOPWORDS -- when indexing, do we want to remove
## stopwords? Use "0" to say "no" and "1" to say "yes".
CFG_BIBINDEX_REMOVE_STOPWORDS = 0
## CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS -- characters considered as
## alphanumeric separators of word-blocks inside words. You probably
## don't want to change this.
CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS = \!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~
## CFG_BIBINDEX_CHARS_PUNCTUATION -- characters considered as punctuation
## between word-blocks inside words. You probably don't want to
## change this.
CFG_BIBINDEX_CHARS_PUNCTUATION = \.\,\:\;\?\!\"
## CFG_BIBINDEX_REMOVE_HTML_MARKUP -- should we attempt to remove HTML markup
## before indexing? Use 1 if you have HTML markup inside metadata
## (e.g. in abstracts), use 0 otherwise.
CFG_BIBINDEX_REMOVE_HTML_MARKUP = 0
## CFG_BIBINDEX_REMOVE_LATEX_MARKUP -- should we attempt to remove LATEX markup
## before indexing? Use 1 if you have LATEX markup inside metadata
## (e.g. in abstracts), use 0 otherwise.
CFG_BIBINDEX_REMOVE_LATEX_MARKUP = 0
## CFG_BIBINDEX_MIN_WORD_LENGTH -- minimum word length allowed to be added to
## index. The terms shorter than this amount will be discarded.
## Useful to keep the database clean, however you can safely leave
## this value on 0 for up to 1,000,000 documents.
CFG_BIBINDEX_MIN_WORD_LENGTH = 0
## CFG_BIBINDEX_URLOPENER_USERNAME and CFG_BIBINDEX_URLOPENER_PASSWORD --
## access credentials to access restricted URLs, interesting only if
## you are fulltext-indexing files located on a remote server that is
## only available via username/password. But it's probably better to
## handle this case via IP or some convention; the current scheme is
## mostly there for demo only.
CFG_BIBINDEX_URLOPENER_USERNAME = mysuperuser
CFG_BIBINDEX_URLOPENER_PASSWORD = mysuperpass
## CFG_INTBITSET_ENABLE_SANITY_CHECKS --
## Enable sanity checks for integers passed to the intbitset data
## structures. It is good to enable this during debugging
## and to disable this value for speed improvements.
CFG_INTBITSET_ENABLE_SANITY_CHECKS = False
#######################################
## Part 7: Access control parameters ##
#######################################
## This section contains some configuration parameters for the access
## control system. Please note that WebAccess is mostly configured on
## run-time via its WebAccess Admin web interface. The parameters
## below are the ones that you do not probably want to modify very
## often during the runtime. (If you do want to modify them during
## runtime, for example to deny access temporarily because of backups,
## you can edit access_control_config.py directly, no need to get back
## here and no need to redo the make process.)
## CFG_ACCESS_CONTROL_LEVEL_SITE -- defines how open this site is.
## Use 0 for normal operation of the site, 1 for read-only site (all
## write operations temporarily closed), 2 for site fully closed,
## 3 for also disabling any database connection.
## Useful for site maintenance.
CFG_ACCESS_CONTROL_LEVEL_SITE = 0
## CFG_ACCESS_CONTROL_LEVEL_GUESTS -- guest users access policy. Use
## 0 to allow guest users, 1 not to allow them (all users must login).
CFG_ACCESS_CONTROL_LEVEL_GUESTS = 0
## CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS -- account registration and
## activation policy. When 0, users can register and accounts are
## automatically activated. When 1, users can register but admin must
## activate the accounts. When 2, users cannot register nor update
## their email address, only admin can register accounts. When 3,
## users cannot register nor update email address nor password, only
## admin can register accounts. When 4, the same as 3 applies, plus
## user cannot change his login method. When 5, then the same as 4
## applies, plus info about how to get an account is hidden from the
## login page.
CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS = 0
## CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN -- limit account
## registration to certain email addresses? If wanted, give domain
## name below, e.g. "cern.ch". If not wanted, leave it empty.
CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN =
## CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS -- send a
## notification email to the administrator when a new account is
## created? Use 0 for no, 1 for yes.
CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS = 0
## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT -- send a
## notification email to the user when a new account is created in order
## to verify the validity of the provided email address? Use
## 0 for no, 1 for yes.
CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT = 1
## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_ACTIVATION -- send a
## notification email to the user when a new account is activated?
## Use 0 for no, 1 for yes.
CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_ACTIVATION = 0
## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_DELETION -- send a
## notification email to the user when an account is deleted or
## account demand rejected? Use 0 for no, 1 for yes.
CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_DELETION = 0
## CFG_APACHE_PASSWORD_FILE -- the file where Apache user credentials
## are stored. Must be an absolute pathname. If the value does not
## start by a slash, it is considered to be the filename of a file
## located under prefix/var/tmp directory. This is useful for the
## demo site testing purposes. For the production site, if you plan
## to restrict access to some collections based on the Apache user
## authentication mechanism, you should put here an absolute path to
## your Apache password file.
CFG_APACHE_PASSWORD_FILE = demo-site-apache-user-passwords
## CFG_APACHE_GROUP_FILE -- the file where Apache user groups are
## defined. See the documentation of the preceding config variable.
CFG_APACHE_GROUP_FILE = demo-site-apache-user-groups
###################################
## Part 8: WebSession parameters ##
###################################
## This section contains some configuration parameters for tweaking
## session handling.
## CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT -- number of days after which a session
## and the corresponding cookie is considered expired.
CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT = 2
## CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER -- number of days after which a session
## and the corresponding cookie is considered expired, when the user has
## requested to permanently stay logged in.
CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER = 365
## CFG_WEBSESSION_RESET_PASSWORD_EXPIRE_IN_DAYS -- when user requested
## a password reset, for how many days is the URL valid?
CFG_WEBSESSION_RESET_PASSWORD_EXPIRE_IN_DAYS = 3
## CFG_WEBSESSION_ADDRESS_ACTIVATION_EXPIRE_IN_DAYS -- when an account
## activation email was sent, for how many days is the URL valid?
CFG_WEBSESSION_ADDRESS_ACTIVATION_EXPIRE_IN_DAYS = 3
## CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS -- when
## user does not confirm his email address and does not complete
## registration, after how many days will it expire?
CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS = 10
## CFG_WEBSESSION_DIFFERENTIATE_BETWEEN_GUESTS -- when set to 0, the session
## system allocates the same uid=0 to all guest users regardless of where they
## come from. When set to 1, it allocates a unique uid to each guest.
CFG_WEBSESSION_DIFFERENTIATE_BETWEEN_GUESTS = 0
################################
## Part 9: BibRank parameters ##
################################
## This section contains some configuration parameters for the ranking
## system.
## CFG_BIBRANK_SHOW_READING_STATS -- do we want to show reading
## similarity stats? ('People who viewed this page also viewed')
CFG_BIBRANK_SHOW_READING_STATS = 1
## CFG_BIBRANK_SHOW_DOWNLOAD_STATS -- do we want to show the download
## similarity stats? ('People who downloaded this document also
## downloaded')
CFG_BIBRANK_SHOW_DOWNLOAD_STATS = 1
## CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS -- do we want to show download
## history graph?
CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS = 1
## CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION -- do we
## want to show a graph representing the distribution of client IPs
## downloading given document?
CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION = 0
## CFG_BIBRANK_SHOW_CITATION_LINKS -- do we want to show the 'Cited
## by' links? (useful only when you have citations in the metadata)
CFG_BIBRANK_SHOW_CITATION_LINKS = 1
## CFG_BIBRANK_SHOW_CITATION_STATS -- do we want to show citation
## stats? ('Cited by M records', 'Co-cited with N records')
CFG_BIBRANK_SHOW_CITATION_STATS = 1
## CFG_BIBRANK_SHOW_CITATION_GRAPHS -- do we want to show citation
## history graph?
CFG_BIBRANK_SHOW_CITATION_GRAPHS = 1
####################################
## Part 10: WebComment parameters ##
####################################
## This section contains some configuration parameters for the
## commenting and reviewing facilities.
## CFG_WEBCOMMENT_ALLOW_COMMENTS -- do we want to allow users write
## public comments on records?
CFG_WEBCOMMENT_ALLOW_COMMENTS = 1
## CFG_WEBCOMMENT_ALLOW_REVIEWS -- do we want to allow users to write
## public reviews of records?
CFG_WEBCOMMENT_ALLOW_REVIEWS = 1
## CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS -- do we want to allow short
## reviews, that is just the attribution of stars without submitting
## detailed review text?
CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS = 0
## CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN -- if users
## report a comment to be abusive, how many reports are needed before
## the site admin is alerted?
CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN = 5
## CFG_WEBCOMMENT_NB_COMMENTS_IN_DETAILED_VIEW -- how many comments do
## we display in the detailed record page upon welcome?
CFG_WEBCOMMENT_NB_COMMENTS_IN_DETAILED_VIEW = 1
## CFG_WEBCOMMENT_NB_REVIEWS_IN_DETAILED_VIEW -- how many reviews do
## we display in the detailed record page upon welcome?
CFG_WEBCOMMENT_NB_REVIEWS_IN_DETAILED_VIEW = 1
## CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL -- do we notify the site
## admin after every comment?
CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL = 1
## CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS -- how many
## elapsed seconds do we consider enough when checking for possible
## multiple comment submissions by a user?
CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS = 20
## CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_REVIEWS_IN_SECONDS -- how many
## elapsed seconds do we consider enough when checking for possible
## multiple review submissions by a user?
CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_REVIEWS_IN_SECONDS = 20
## CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR -- enable the WYSIWYG
## Javascript-based editor when user edits comments?
CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR = False
## CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL -- the email address from which the
## alert emails will appear to be sent:
CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL = cds.alert@cdsdev.cern.ch
##################################
## Part 11: BibSched parameters ##
##################################
## This section contains some configuration parameters for the
## bibliographic task scheduler.
## CFG_BIBSCHED_REFRESHTIME -- how often do we want to refresh
## bibsched monitor? (in seconds)
CFG_BIBSCHED_REFRESHTIME = 5
## CFG_BIBSCHED_LOG_PAGER -- what pager to use to view bibsched task
## logs?
CFG_BIBSCHED_LOG_PAGER = /bin/more
## CFG_BIBSCHED_GC_TASKS_OLDER_THAN -- after how many days to perform the
## garbage collection of the BibSched queue (i.e. removing/moving tasks to archive).
CFG_BIBSCHED_GC_TASKS_OLDER_THAN = 30
## CFG_BIBSCHED_GC_TASKS_TO_REMOVE -- list of BibTask that can be safely
## removed from the BibSched queue once they are DONE.
CFG_BIBSCHED_GC_TASKS_TO_REMOVE = bibindex,bibreformat,webcoll,bibrank,inveniogc
## CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE -- list of BibTasks that should be safely
## archived out of the BibSched queue once they are DONE.
CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE = bibupload,oaiarchive
## CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS -- maximum number of BibTasks
## that can run concurrently.
## NOTE: concurrent tasks are still considered as an experimental
## feature. Please keep this value set to 1 on production environments.
CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS = 1
## CFG_BIBSCHED_PROCESS_USER -- bibsched and bibtask processes must
## usually run under the same identity as the Apache web server
## process in order to share proper file read/write privileges. If
## you want to force some other bibsched/bibtask user, e.g. because
## you are using a local `invenio' user that belongs to your
## `www-data' Apache user group and so shares writing rights with your
## Apache web server process in this way, then please set its username
## identity here. Otherwise we shall check whether your
## bibsched/bibtask processes are run under the same identity as your
## Apache web server process (in which case you can leave the default
## empty value here).
CFG_BIBSCHED_PROCESS_USER =
###################################
## Part 12: WebBasket parameters ##
###################################
## CFG_WEBBASKET_MAX_NUMBER_OF_DISPLAYED_BASKETS -- a safety limit for
## a maximum number of displayed baskets
CFG_WEBBASKET_MAX_NUMBER_OF_DISPLAYED_BASKETS = 20
## CFG_WEBBASKET_USE_RICH_TEXT_EDITOR -- enable the WYSIWYG
## Javascript-based editor when user edits comments in WebBasket?
CFG_WEBBASKET_USE_RICH_TEXT_EDITOR = False
##################################
## Part 13: WebAlert parameters ##
##################################
## This section contains some configuration parameters for the
## automatic email notification alert system.
## CFG_WEBALERT_ALERT_ENGINE_EMAIL -- the email address from which the
## alert emails will appear to be sent:
CFG_WEBALERT_ALERT_ENGINE_EMAIL = cds.alert@cdsdev.cern.ch
## CFG_WEBALERT_MAX_NUM_OF_RECORDS_IN_ALERT_EMAIL -- how many records
## at most do we send in an outgoing alert email?
CFG_WEBALERT_MAX_NUM_OF_RECORDS_IN_ALERT_EMAIL = 20
## CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL -- number of
## chars per line in an outgoing alert email?
CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL = 72
## CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES -- when sending alert
## emails fails, how many times we retry?
CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES = 3
## CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES -- when sending
## alert emails fails, what is the sleeptime between tries? (in
## seconds)
CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES = 300
####################################
## Part 14: WebMessage parameters ##
####################################
## CFG_WEBMESSAGE_MAX_SIZE_OF_MESSAGE -- how large web messages do we
## allow?
CFG_WEBMESSAGE_MAX_SIZE_OF_MESSAGE = 20000
## CFG_WEBMESSAGE_MAX_NB_OF_MESSAGES -- how many messages for a
## regular user do we allow in its inbox?
CFG_WEBMESSAGE_MAX_NB_OF_MESSAGES = 30
## CFG_WEBMESSAGE_DAYS_BEFORE_DELETE_ORPHANS -- how many days before
## we delete orphaned messages?
CFG_WEBMESSAGE_DAYS_BEFORE_DELETE_ORPHANS = 60
##################################
## Part 15: MiscUtil parameters ##
##################################
## CFG_MISCUTIL_SQL_MAX_CACHED_QUERIES -- maximum number of cached SQL
## queries possible. After reaching this number the cache is pruned
## by deleting half of the older queries.
CFG_MISCUTIL_SQL_MAX_CACHED_QUERIES = 10000
## CFG_MISCUTIL_SQL_USE_SQLALCHEMY -- whether to use SQLAlchemy.pool
## in the DB engine of CDS Invenio. It is okay to enable this flag
## even if you have not installed SQLAlchemy. Note that Invenio will
## lose some performance if this option is enabled.
CFG_MISCUTIL_SQL_USE_SQLALCHEMY = False
## CFG_MISCUTIL_SQL_RUN_SQL_MANY_LIMIT -- how many queries can we run
## inside run_sql_many() in one SQL statement? The limit value
## depends on MySQL's max_allowed_packet configuration.
CFG_MISCUTIL_SQL_RUN_SQL_MANY_LIMIT = 10000
## CFG_MISCUTIL_SMTP_HOST -- which server to use as outgoing mail server to
## send outgoing emails generated by the system, for example concerning
## submissions or email notification alerts.
CFG_MISCUTIL_SMTP_HOST = localhost
## CFG_MISCUTIL_SMTP_PORT -- which port to use on the outgoing mail server
## defined in the previous step.
CFG_MISCUTIL_SMTP_PORT = 25
#################################
## Part 16: BibEdit parameters ##
#################################
## CFG_BIBEDIT_TIMEOUT -- when a user edits a record, this record is
## locked to prevent other users from editing it at the same time.
## After how many seconds of inactivity will the locked record become
## free again for other people to edit?
CFG_BIBEDIT_TIMEOUT = 3600
## CFG_BIBEDIT_LOCKLEVEL -- when a user tries to edit a record for which
## there is a pending bibupload task in the queue, this shouldn't be permitted.
## The lock level determines how thoroughly the queue should be investigated
## to determine if this is the case.
## Level 0 - always permits editing, doesn't look at the queue
## (unsafe, use only if you know what you are doing)
## Level 1 - permits editing if there are no queued bibedit tasks for this record
## (safe with respect to bibedit, but not for other bibupload maintenance jobs)
## Level 2 - permits editing if there are no queued bibupload tasks of any sort
## (safe, but may lock more than necessary if many cataloguers around)
## Level 3 - permits editing if no queued bibupload task concerns given record
## (safe, most precise locking, but slow,
## checks for 001/EXTERNAL_SYSNO_TAG/EXTERNAL_OAIID_TAG)
## The recommended level is 3 (default) or 2 (if you use maintenance jobs often).
CFG_BIBEDIT_LOCKLEVEL = 3
## CFG_BIBEDIT_PROTECTED_FIELDS -- a comma-separated list of fields that BibEdit
## will not allow to be added, edited or deleted. Wildcards are not supported,
## but conceptually a wildcard is added at the end of every field specification.
## Examples:
## 500A - protect all MARC fields with tag 500 and first indicator A
## 5 - protect all MARC fields in the 500-series.
## 909C_a - protect subfield a in tag 909 with first indicator C and empty
## second indicator
## Note that 001 is protected by default, but if protection of other
## identifiers or automated fields is a requirement, they should be added to
## this list.
CFG_BIBEDIT_PROTECTED_FIELDS =
###################################
## Part 17: BibUpload parameters ##
###################################
## CFG_BIBUPLOAD_REFERENCE_TAG -- where do we store references?
CFG_BIBUPLOAD_REFERENCE_TAG = 999
## CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG -- where do we store external
## system numbers? Useful for matching when our records come from an
## external digital library system.
CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG = 970__a
## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG -- where do we store OAI ID tags
## of harvested records? Useful for matching when we harvest stuff
## via OAI that we do not want to reexport via Invenio OAI; so records
## may have only the source OAI ID stored in this tag (kind of like
## external system number too).
CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG = 035__a
## CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG -- where do we store OAI SRC
## tags of harvested records? Useful for matching when we harvest stuff
## via OAI that we do not want to reexport via Invenio OAI; so records
## may have only the source OAI SRC stored in this tag (kind of like
## external system number too). Note that the field should be the same as
## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG.
CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG = 035__9
## CFG_BIBUPLOAD_STRONG_TAGS -- a comma-separated list of tags that
## are strong enough to resist the replace mode. Useful for tags that
## might be created from an external non-metadata-like source,
## e.g. the information about the number of copies left.
CFG_BIBUPLOAD_STRONG_TAGS = 964
## CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS -- a comma-separated list
## of tags that contain provenance information that should be checked
## in the bibupload correct mode via matching provenance codes. (Only
## field instances of the same provenance information would be acted
## upon.) Please specify the whole tag info up to subfield codes.
CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS = 6531_9
## CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS -- a comma-separated list of system
## paths from which it is allowed to take fulltexts that will be uploaded via
## FFT (CFG_TMPDIR is included by default).
CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS = /tmp,/home
## CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE -- do we want to serialize
## internal representation of records (Pythonic record structure) into
## the database? This can improve internal processing speed of some
## operations at the price of somewhat bigger disk space usage.
## If you change this value after some records have already been added
## to your installation, you may want to run:
## $ /opt/cds-invenio/bin/inveniocfg --reset-recstruct-cache
## in order to either erase the cache thus freeing database space,
## or to fill the cache for all records that have not been cached yet.
CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE = 1
####################################
## Part 18: BibCatalog parameters ##
####################################
## EXPERIMENTAL: Please do not use.
CFG_BIBCATALOG_SYSTEM =
CFG_BIBCATALOG_SYSTEM_RT_CLI = /usr/bin/rt
CFG_BIBCATALOG_SYSTEM_RT_URL = http://localhost/rt3
CFG_BIBCATALOG_QUEUES = General
+####################################
+## Part 19: BibFormat parameters ##
+####################################
+
+## CFG_BIBFORMAT_HIDDEN_TAGS -- comma-separated list of MARC tags that
+## are not shown to users not having cataloging authorizations.
+CFG_BIBFORMAT_HIDDEN_TAGS = 595
+
##########################
## THAT's ALL, FOLKS! ##
##########################
diff --git a/modules/bibformat/lib/bibformat.py b/modules/bibformat/lib/bibformat.py
index 38fd7255a..8814b1838 100644
--- a/modules/bibformat/lib/bibformat.py
+++ b/modules/bibformat/lib/bibformat.py
@@ -1,491 +1,537 @@
# -*- coding: utf-8 -*-
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
Format records using specified format.
API functions: format_record, format_records, create_excel,
get_output_format_content_type
Used to wrap the BibFormat engine and associated functions. This is
also where special formatting of multiple records (that the engine
does not handle, as it works on a single record basis) should be put,
with name create_*.
SEE: bibformat_utils.py
FIXME: currently copies record_exists() code from search engine. Refactor later.
"""
__revision__ = "$Id$"
import zlib
from invenio import bibformat_dblayer
from invenio import bibformat_engine
from invenio import bibformat_utils
from invenio.errorlib import register_exception
from invenio.config import \
CFG_SITE_LANG, \
CFG_PATH_PHP, \
- CFG_SITE_URL
+ CFG_SITE_URL, \
+ CFG_BIBFORMAT_HIDDEN_TAGS
from invenio.bibformat_config import \
CFG_BIBFORMAT_USE_OLD_BIBFORMAT, \
CFG_BIBFORMAT_ENABLE_I18N_BRIEF_FORMAT
+from invenio.access_control_engine import acc_authorize_action
try:
import invenio.template
websearch_templates = invenio.template.load('websearch')
except:
pass
import getopt
import sys
# Functions to format a single record
##
def filter_hidden_fields(recxml, user_info=None, filter_tags=CFG_BIBFORMAT_HIDDEN_TAGS,
                         force_filtering=False):
    """
    Filter out the datafields whose tag is listed in filter_tags from
    a MARCXML string.  Nothing is filtered when user_info is None or
    when the user is authorized for the 'runbibedit' action, unless
    force_filtering is set to True.

    The filtering is line-based: every line from the one containing
    'datafield tag="TAG"' up to and including the next line containing
    '</datafield>' is dropped.

    @param recxml: marcxml presentation of the record
    @param user_info: user information; if None, then assume invoked
        via CLI with all rights
    @param filter_tags: MARC tags to be filtered, either as a list or
        as a single comma-separated string (e.g. "595,596")
    @param force_filtering: do we force filtering regardless of user rights?
    @return: recxml without the hidden fields; every kept line is
        terminated by a newline
    """
    if not force_filtering:
        if user_info is None:
            # by default (no user context): return everything
            return recxml
        if acc_authorize_action(user_info, 'runbibedit')[0] == 0:
            # code 0 is treated as "authorized": no need to filter
            return recxml

    # A plain string would otherwise be iterated character by
    # character, so accept the comma-separated form as well.
    # NOTE(review): a Python 2 unicode value is not covered by this
    # check -- confirm what type invenio.config delivers.
    if isinstance(filter_tags, str):
        filter_tags = filter_tags.split(',')
    openers = ['datafield tag="%s"' % str(htag).strip()
               for htag in filter_tags
               if str(htag).strip()]

    kept = []
    omit = False
    for line in recxml.split("\n"):
        if not omit:
            # does a hidden datafield start on this line?
            for opener in openers:
                if opener in line:
                    omit = True
                    break
        if not omit:
            kept.append(line + "\n")
        elif '</datafield>' in line:
            # end of the hidden datafield (may be on the opening line
            # itself); resume copying from the next line on
            omit = False
    return "".join(kept)
+
def format_record(recID, of, ln=CFG_SITE_LANG, verbose=0, search_pattern=None,
xml_record=None, user_info=None, on_the_fly=False):
"""
Formats a record given output format.
Returns a formatted version of the record in the specified
language, search pattern, and with the specified output format.
The function will define which format template must be applied.
The record to be formatted can be specified with its ID (with
'recID' parameter) or given as XML representation(with
'xml_record' parameter). If both are specified 'recID' is ignored.
'user_info' allows to grant access to some functionalities on a
page depending on the user's priviledges. The 'user_info' object
makes sense only in the case of on-the-fly formatting. 'user_info'
is the same object as the one returned by
'webuser.collect_user_info(req)'
@param recID: the ID of record to format
@param of: an output format code (or short identifier for the output format)
@param ln: the language to use to format the record
@param verbose: the level of verbosity from 0 to 9 (O: silent,
5: errors,
7: errors and warnings, stop if error in format elements
9: errors and warnings, stop if error (debug mode ))
@param search_pattern: list of strings representing the user request in web interface
@param xml_record: an xml string represention of the record to format
@param user_info: the information of the user who will view the formatted page (if applicable)
@param on_the_fly: if False, try to return an already preformatted version of the record in the database
@return: formatted record
"""
if search_pattern is None:
search_pattern = []
out = ""
+
if verbose == 9:
out += """\n
Formatting record %i with output format %s.
""" % (recID, of)
############### FIXME: REMOVE WHEN MIGRATION IS DONE ###############
if CFG_BIBFORMAT_USE_OLD_BIBFORMAT and CFG_PATH_PHP:
return bibformat_engine.call_old_bibformat(recID, format=of, on_the_fly=on_the_fly)
############################# END ##################################
if not on_the_fly and \
(ln == CFG_SITE_LANG or \
of.lower() == 'xm' or \
CFG_BIBFORMAT_USE_OLD_BIBFORMAT or \
(CFG_BIBFORMAT_ENABLE_I18N_BRIEF_FORMAT == False and of.lower() == 'hb')):
# Try to fetch preformatted record
# Only possible for records formatted in CFG_SITE_LANG
# language (other are never stored), or of='xm' which does not
# depend on language.
# Also, when formatting in HB, and when
# CFG_BIBFORMAT_ENABLE_I18N_BRIEF_FORMAT is set to False,
# ignore other languages and fetch the preformatted output.
res = bibformat_dblayer.get_preformatted_record(recID, of)
if res is not None:
# record 'recID' is formatted in 'of', so return it
if verbose == 9:
last_updated = bibformat_dblayer.get_preformatted_record_date(recID, of)
out += """\n
Found preformatted output for record %i (cache updated on %s).
""" % (recID, last_updated)
+ if of.lower() == 'xm':
+ res = filter_hidden_fields(res, user_info)
out += res
return out
else:
if verbose == 9:
out += """\n
No preformatted output found for record %s.
"""% recID
# Live formatting of records in all other cases
if verbose == 9:
out += """\n
Formatting record %i on-the-fly.
""" % recID
try:
out += bibformat_engine.format_record(recID=recID,
of=of,
ln=ln,
verbose=verbose,
search_pattern=search_pattern,
xml_record=xml_record,
user_info=user_info)
+ if of.lower() == 'xm':
+ out = filter_hidden_fields(out, user_info)
return out
except Exception, e:
register_exception(prefix="An error occured while formatting record %i in %s" % \
(recID, of),
alert_admin=True)
#Failsafe execution mode
if verbose == 9:
out += """\n
An error occured while formatting record %i. (%s)
""" % (recID, str(e))
if of.lower() == 'hd':
if verbose == 9:
out += """\n
Formatting record %i with websearch_templates.tmpl_print_record_detailed.
""" % recID
return out + websearch_templates.tmpl_print_record_detailed(
ln = ln,
recID = recID,
)
if verbose == 9:
out += """\n
Formatting record %i with websearch_templates.tmpl_print_record_brief.
""" % recID
return out + websearch_templates.tmpl_print_record_brief(ln = ln,
recID = recID,
)
def record_get_xml(recID, format='xm', decompress=zlib.decompress):
    """
    Return the XML string of record 'recID'.

    Thin wrapper: all arguments are handed over unchanged to
    bibformat_utils.record_get_xml(), which builds the XML directly
    from the database, without using the standard formatting process.

    'format' selects the flavour of XML:
        - 'xm' for standard XML
        - 'marcxml' for MARC XML
        - 'oai_dc' for OAI Dublin Core
        - 'xd' for XML Dublin Core

    If the record does not exist, an empty string is returned.

    @param recID: the id of the record to retrieve
    @param format: the XML flavour (see above)
    @param decompress: function handed over to bibformat_utils to
        decompress the stored value (zlib.decompress by default)
    @return: the xml string of the record
    """
    xml_string = bibformat_utils.record_get_xml(recID=recID,
                                                format=format,
                                                decompress=decompress)
    return xml_string
# Helper functions to do complex formatting of multiple records
#
# You should not modify format_records when adding a complex
# formatting of multiple records, but add a create_* method
# that relies on format_records to do the formatting.
##
def format_records(recIDs, of, ln=CFG_SITE_LANG, verbose=0, search_pattern=None,
                   xml_records=None, user_info=None, record_prefix=None,
                   record_separator=None, record_suffix=None, prologue="",
                   epilogue="", req=None, on_the_fly=False):
    """
    Returns a single string made of the formatted records given by a
    list of record IDs or a list of records as xml. Adds a prefix
    before each record, a suffix after each record, plus a separator
    between records.

    Also adds optional prologue and epilogue to the complete formatted
    output.

    You can either specify a list of record IDs to format, or a list
    of xml records, but not both (if both are specified recIDs is
    ignored).

    'record_separator' is either a string, used between all formatted
    records, or a function that returns a string. The function must
    take an integer as unique parameter, which is the index in recIDs
    (or xml_records) of the record that has just been formatted. For
    example separator(i) must return the separator between recID[i]
    and recID[i+1]. The same applies to 'record_prefix' and
    'record_suffix'. No separator is written after the last record.

    'req' is an optional parameter on which the result of the function
    is written progressively (record after record) if it is given.
    Note that you should set 'req' content-type by yourself, and send
    http header before calling this function as it will not do it.

    This function takes the same parameters as 'format_record' except for:
    @param recIDs: a list of record IDs
    @param xml_records: a list of xml string representations of the records to format
    @param record_prefix: a string, or a function of the record index returning a string, written before each record
    @param record_separator: a string, or a function of the record index returning a string, written between records
    @param record_suffix: a string, or a function of the record index returning a string, written after each record
    @param prologue: a string written before all formatted records
    @param epilogue: a string written after all formatted records
    @param req: an optional request object where to print records
    @param on_the_fly: if False, try to return an already preformatted version of the record in the database
    @return: prologue + decorated formatted records + epilogue, as one string
    """
    pieces = []

    def _emit(text):
        """Remember 'text' for the return value and stream it to req if given."""
        pieces.append(text)
        if req is not None:
            req.write(text)

    def _resolve(decoration, index):
        """Return the decoration text: call it if callable, else use it as is."""
        # callable() rather than isinstance(..., str), so that unicode
        # strings are used as text instead of being called.
        if callable(decoration):
            return decoration(index)
        return decoration

    if req is not None:
        req.write(prologue)

    #Fill one of the lists with Nones
    if xml_records is not None:
        recIDs = [None for dummy in xml_records]
    else:
        xml_records = [None for dummy in recIDs]

    total_rec = len(recIDs)
    for i in range(total_rec):
        #Print prefix
        if record_prefix is not None:
            _emit(_resolve(record_prefix, i))

        #Print formatted record
        _emit(format_record(recIDs[i], of, ln, verbose,
                            search_pattern, xml_records[i],
                            user_info, on_the_fly))

        #Print suffix
        if record_suffix is not None:
            _emit(_resolve(record_suffix, i))

        #Print separator if needed (never after the last record)
        if record_separator is not None and i != total_rec - 1:
            _emit(_resolve(record_separator, i))

    if req is not None:
        req.write(epilogue)

    return prologue + "".join(pieces) + epilogue
def create_excel(recIDs, req=None, ln=CFG_SITE_LANG, ot=None, ot_sep="; "):
"""
Returns an Excel readable format containing the given recIDs.
If 'req' is given, also prints the output in 'req' while individual
records are being formatted.
This method shows how to create a custom formatting of multiple
records.
The excel format is a basic HTML table that most spreadsheets
applications can parse.
If 'ot' is given, the BibFormat engine is overridden and the
output is produced on the basis of the fields that 'ot' defines
(see search_engine.perform_request_search(..) 'ot' param).
@param recIDs: a list of record IDs
@param ot: a list of fields that should be included in the excel output as columns(see perform_request_search 'ot' param)
@param ot_sep: a separator used to separate values for the same record, in the same columns, if any
@return: a string in Excel format
"""
# Prepare the column headers to display in the Excel file
column_headers_list = ['Title',
'Authors',
'Addresses',
'Affiliation',
'Date',
'Publisher',
'Place',
'Abstract',
'Keywords',
'Notes']
# Prepare Content
column_headers = '
'
# Apply content_type and print column headers
if req is not None:
req.content_type = get_output_format_content_type('excel')
req.headers_out["Content-Disposition"] = "inline; filename=%s" % 'results.xls'
req.send_http_header()
if ot is not None and len(ot) > 0:
# Skip BibFormat engine, produce our own output based on
# specified fields. Each field will be a column of the
# output. If a field has multiple values, then they are joined
# into the same cell.
out = "
')
return out
#Format the records
excel_formatted_records = format_records(recIDs, 'excel', ln=CFG_SITE_LANG,
record_separator='\n',
prologue = '
',
epilogue = footer,
req=req)
return excel_formatted_records
# Utility functions
##
def get_output_format_content_type(of):
    """
    Look up the content type (eg. 'text/html' or 'application/ms-excel')
    of the given output format via bibformat_dblayer.

    When the lookup yields an empty string, 'text/html' is returned
    instead.

    @param of: the code of output format for which we want to get the content type
    @return: the content type string
    """
    stored_type = bibformat_dblayer.get_output_format_content_type(of)
    if stored_type == '':
        # nothing stored for this format: fall back to HTML
        return 'text/html'
    return stored_type
def usage(exitcode=1, msg=""):
"""Prints usage info."""
if msg:
sys.stderr.write("Error: %s.\n" % msg)
print """BibFormat: outputs the result of the formatting of a record.
Usage: bibformat required [options]
Examples:
$ bibformat -i 10 -o HB
$ bibformat -i 10,11,13 -o HB
$ bibformat -i 10:13
$ bibformat -i 10 -o HB -v 9
Required:
-i, --id=ID[ID2,ID3:ID5] ID (or range of IDs) of the record(s) to be formatted.
Options:
-o, --output=CODE short code of the output format used for formatting (default HB).
-l, --lang=LN language used for formatting.
-y, --onthefly on-the-fly formatting, avoiding caches created by BibReformat.
General options:
-h, --help print this help and exit
-v, --verbose=LEVEL verbose level (from 0 to 9, default 0)
-V --version print the script version
"""
sys.exit(exitcode)
def main():
"""main entry point for biformat via command line"""
options = {} # will hold command-line options
options["verbose"] = 0
options["onthefly"] = False
options["lang"] = CFG_SITE_LANG
options["output"] = "HB"
options["recID"] = None
try:
opts, args = getopt.getopt(sys.argv[1:],
"hVv:yl:i:o:",
["help",
"version",
"verbose=",
"onthefly",
"lang=",
"id=",
"output="])
except getopt.GetoptError, err:
usage(1, err)
pass
try:
for opt in opts:
if opt[0] in ["-h", "--help"]:
usage(0)
elif opt[0] in ["-V", "--version"]:
print __revision__
sys.exit(0)
elif opt[0] in ["-v", "--verbose"]:
options["verbose"] = int(opt[1])
elif opt[0] in ["-y", "--onthefly"]:
options["onthefly"] = True
elif opt[0] in ["-l", "--lang"]:
options["lang"] = opt[1]
elif opt[0] in ["-i", "--id"]:
recIDs = []
for recID in opt[1].split(','):
if ":" in recID:
start = int(recID.split(':')[0])
end = int(recID.split(':')[1])
recIDs.extend(range(start, end))
else:
recIDs.append(int(recID))
options["recID"] = recIDs
elif opt[0] in ["-o", "--output"]:
options["output"] = opt[1]
if options["recID"] == None:
usage(1, "-i argument is needed")
except StandardError, e:
usage(e)
print format_records(recIDs=options["recID"],
of=options["output"],
ln=options["lang"],
verbose=options["verbose"],
on_the_fly=options["onthefly"])
return
if __name__ == "__main__":
main()
diff --git a/modules/bibformat/lib/bibformat_engine.py b/modules/bibformat/lib/bibformat_engine.py
index f829054d1..360f10b78 100644
--- a/modules/bibformat/lib/bibformat_engine.py
+++ b/modules/bibformat/lib/bibformat_engine.py
@@ -1,2042 +1,2051 @@
# -*- coding: utf-8 -*-
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
Formats a single XML Marc record using specified format.
There is no API for the engine. Instead use bibformat.py.
SEE: bibformat.py, bibformat_utils.py
"""
__revision__ = "$Id$"
import re
import sys
import os
import inspect
import traceback
import zlib
import cgi
from invenio.config import \
CFG_PATH_PHP, \
CFG_BINDIR, \
CFG_SITE_LANG
from invenio.errorlib import \
register_errors, \
get_msgs_for_code_list
from invenio.bibrecord import \
create_record, \
record_get_field_instances, \
record_get_field_value, \
record_get_field_values
from invenio.bibformat_xslt_engine import format
from invenio.dbquery import run_sql
from invenio.messages import \
language_list_long, \
wash_language, \
gettext_set_language
from invenio import bibformat_dblayer
from invenio.bibformat_config import \
CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION, \
CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION, \
CFG_BIBFORMAT_TEMPLATES_PATH, \
CFG_BIBFORMAT_ELEMENTS_PATH, \
CFG_BIBFORMAT_OUTPUTS_PATH, \
CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH
from invenio.bibformat_utils import \
record_get_xml, \
parse_tag
from invenio.htmlutils import \
HTMLWasher, \
cfg_html_buffer_allowed_tag_whitelist, \
cfg_html_buffer_allowed_attribute_whitelist
from invenio.webuser import collect_user_info
from invenio.bibknowledge import get_kbr_values
from HTMLParser import HTMLParseError
if CFG_PATH_PHP: #Remove when call_old_bibformat is removed
from xml.dom import minidom
import tempfile
# Cache for data we have already read and parsed
format_templates_cache = {}
format_elements_cache = {}
format_outputs_cache = {}
html_field = '' # String indicating that field should be
# treated as HTML (and therefore no escaping of
# HTML tags should occur.
# Appears in some field values.
washer = HTMLWasher() # Used to remove dangerous tags from HTML
# sources
# Regular expression for finding ... tag in format templates
pattern_lang = re.compile(r'''
#closing start tag
(?P.*?) #anything but the next group (greedy)
() #end tag
''', re.IGNORECASE | re.DOTALL | re.VERBOSE)
# Builds regular expression for finding each known language in tags
ln_pattern_text = r"<("
for lang in language_list_long(enabled_langs_only=False):
ln_pattern_text += lang[0] +r"|"
ln_pattern_text = ln_pattern_text.rstrip(r"|")
ln_pattern_text += r")>(.*?)\1>"
ln_pattern = re.compile(ln_pattern_text, re.IGNORECASE | re.DOTALL)
# Regular expression for finding text to be translated
translation_pattern = re.compile(r'_\((?P.*?)\)_', \
re.IGNORECASE | re.DOTALL | re.VERBOSE)
# Regular expression for finding <name> tag in format templates.
# The named group 'name' holds the template name.
pattern_format_template_name = re.compile(r'''
    <name              #<name tag (no matter case)
    \s*                #any number of white spaces
    >                  #closing <name> start tag
    (?P<name>.*?)      #name value. any char that is not end tag
    (</name\s*>)(\n)?  #end tag, and the newline following it if any
    ''', re.IGNORECASE | re.DOTALL | re.VERBOSE)

# Regular expression for finding <description> tag in format templates.
# The named group 'desc' holds the template description.
pattern_format_template_desc = re.compile(r'''
    <description           #<description tag (no matter case)
    \s*                    #any number of white spaces
    >                      #closing <description> start tag
    (?P<desc>.*?)          #description value. any char that is not end tag
    </description\s*>(\n)? #end tag, and the newline following it if any
    ''', re.IGNORECASE | re.DOTALL | re.VERBOSE)
# Regular expression for finding <BFE_ > tags in format templates.
# Named groups: 'function_name' is the element name following 'BFE_' and
# 'params' is the raw text of all name="value" pairs of the tag.
pattern_tag = re.compile(r'''
    <BFE_                        #every special tag starts with <BFE_ (no matter case)
    (?P<function_name>[^/\s]+)   #any char but a space or slash
    \s*                          #any number of spaces
    (?P<params>(\s*              #params here
     (?P<param>([^=\s])*)\s*     #param name: any chars that is not a white space or equality. Followed by space(s)
     =\s*                        #equality: = followed by any number of spaces
     (?P<sep>[\'"])              #one of the separators
     (?P<value>.*?)              #param value: any chars that is not a separator like previous one
     (?P=sep)                    #same separator as starting one
    )*)                          #many params
    \s*                          #any number of spaces
    (/)?>                        #end of the tag
    ''', re.IGNORECASE | re.DOTALL | re.VERBOSE)

# Regular expression for finding params inside <BFE_ > tags in format
# templates. Applied to the 'params' group of pattern_tag; each match gives
# one parameter through the named groups 'param' (name) and 'value'.
pattern_function_params = re.compile(r'''
    (?P<param>([^=\s])*)\s*  # Param name: any chars that is not a white space or equality. Followed by space(s)
    =\s*                     # Equality: = followed by any number of spaces
    (?P<sep>[\'"])           # One of the separators
    (?P<value>.*?)           # Param value: any chars that is not a separator like previous one
    (?P=sep)                 # Same separator as starting one
    ''', re.VERBOSE | re.DOTALL )
# Regular expression for finding format elements "params" attributes
# (defined by @param) in the docstring of a format element.
# Named groups: 'name' is the parameter name, 'desc' its description.
pattern_format_element_params = re.compile(r'''
    @param\s*                          # Begins with AT param keyword followed by space(s)
    (?P<name>[^\s=]*):\s*              # A single keyword and colon, then space(s)
    #(=\s*(?P<sep>[\'"])               # Equality, space(s) and then one of the separators
    #(?P<default>.*?)                  # Default value: any chars that is not a separator like previous one
    #(?P=sep)                          # Same separator as starting one
    #)?\s*                             # Default value for param is optional. Followed by space(s)
    (?P<desc>.*)                       # Any text up to the end of line ('.' does not match newline here)
    ''', re.VERBOSE | re.MULTILINE)

# Regular expression for finding format elements "see also" attribute
# (defined by @see). The named group 'see' holds the rest of the line.
pattern_format_element_seealso = re.compile(r'''@see:\s*(?P<see>.*)''',
                                            re.VERBOSE | re.MULTILINE)
#Regular expression for finding 2 expressions in quotes, separated by
#comma (as in template("1st","2nd") )
#Used when parsing output formats
## pattern_parse_tuple_in_quotes = re.compile('''
## (?P[\'"])
## (?P.*)
## (?P=sep1)
## \s*,\s*
## (?P[\'"])
## (?P.*)
## (?P=sep2)
## ''', re.VERBOSE | re.MULTILINE)
def call_old_bibformat(recID, format="HD", on_the_fly=False, verbose=0):
    """
    FIXME: REMOVE FUNCTION WHEN MIGRATION IS DONE

    Calls the old BibFormat for the record RECID in the desired output
    format FORMAT.

    Note: this function always tries to return HTML, so when
    bibformat returns XML with embedded HTML format inside the tag
    FMT $g, as is suitable for prestoring output formats, we
    perform un-XML-izing here in order to return HTML body only.

    @param recID: the ID of the record to format
    @param format: the code of the output format to produce
    @param on_the_fly: if False, try to return an already preformatted version of the record in the database
    @param verbose: if 9, debugging notes are prepended to the on-the-fly output
    @return: the formatted record, as a string
    """
    out = ""
    res = []
    if not on_the_fly:
        # Look for formatted record existence. The values are handed over
        # as query parameters rather than pasted into the SQL text.
        query = "SELECT value, last_updated FROM bibfmt WHERE "\
                "id_bibrec=%s AND format=%s"
        res = run_sql(query, (recID, format), 1)
    if res:
        # record 'recID' is formatted in 'format', so print it
        if verbose == 9:
            last_updated = res[0][1]
            # NOTE(review): this note is built but not part of the value
            # returned below (unchanged from the original behaviour).
            out += "\n\nFound preformatted output for record %i (cache updated on %s).\n" % \
                   (recID, last_updated)
        decompress = zlib.decompress
        return "%s" % decompress(res[0][0])

    # record 'recID' is not formatted in 'format',
    # so try to call BibFormat on the fly or use default format:
    if verbose == 9:
        out += "\n\nFormatting record %i on-the-fly with old BibFormat.\n" % recID

    # Retrieve MARCXML
    # Build it on-the-fly only if 'call_old_bibformat' was called
    # with format=xm and on_the_fly=True
    xm_record = record_get_xml(recID, 'xm',
                               on_the_fly=(on_the_fly and format == 'xm'))

    # Some problems have been found using either popen() or os.system()
    # with pipes, so the record is exchanged through temporary files.
    (result_code, result_path) = tempfile.mkstemp()
    (xm_code, xm_path) = tempfile.mkstemp()
    try:
        xm_file = open(xm_path, "w")
        try:
            xm_file.write(xm_record)
        finally:
            xm_file.close()

        # NOTE(review): 'format' is interpolated into a shell command;
        # callers must make sure it is a trusted output format code.
        command = "( %s/bibformat otype=%s ) > %s <%s" % \
                  (CFG_BINDIR, format, result_path, xm_path)
        os.system(command)

        result_file = open(result_path, "r")
        try:
            bibformat_output = result_file.read()
        finally:
            result_file.close()
    finally:
        # Always release the descriptors and remove the temporary files,
        # even when writing, running or reading failed.
        os.close(result_code)
        os.remove(result_path)
        os.close(xm_code)
        os.remove(xm_path)

    if bibformat_output.startswith("<record>"):
        # XML output: keep only the content of subfields with code 'g'.
        # (An empty prefix here would match every output and make the
        # plain-output branch below unreachable.)
        dom = minidom.parseString(bibformat_output)
        for e in dom.getElementsByTagName('subfield'):
            if e.getAttribute('code') == 'g':
                for t in e.childNodes:
                    out += t.data.encode('utf-8')
    else:
        out += bibformat_output
    return out
def format_record(recID, of, ln=CFG_SITE_LANG, verbose=0,
                  search_pattern=None, xml_record=None, user_info=None):
    """
    Main entry point of the bibformat engine: formats one record in the
    given output format.

    The record is either looked up by 'recID' or, when 'xml_record' is
    not None, taken from that XML string. The format template to apply
    is chosen from the output format rules. While formats are being
    migrated, the old BibFormat is called when no readable template is
    found and CFG_PATH_PHP is set.

    'user_info' allows to grant access to some functionalities on a
    page depending on the user's privileges. It is the same object as
    the one returned by 'webuser.collect_user_info(req)'.

    @param recID: the ID of record to format
    @param of: an output format code (or short identifier for the output format)
    @param ln: the language to use to format the record
    @param verbose: the level of verbosity from 0 to 9 (O: silent,
                                                       5: errors,
                                                       7: errors and warnings, stop if error in format elements
                                                       9: errors and warnings, stop if error (debug mode ))
    @param search_pattern: list of strings representing the user request in web interface
    @param xml_record: an xml string representing the record to format
    @param user_info: the information of the user who will view the formatted page
    @return: formatted record
    """
    if search_pattern is None:
        search_pattern = []
    debug = (verbose == 9)
    out = ""
    errors_ = []

    # Object carrying the record and the formatting context
    bfo = BibFormatObject(recID, ln, search_pattern, xml_record, user_info, of)

    if of.lower() != 'xm' and \
           (not bfo.get_record() or len(bfo.get_record()) <= 1):
        # Record only has recid: nothing to format, except for xm format
        return ""

    # Which format template applies to this record and output format?
    template = decide_format_template(bfo, of)
    if debug and template is not None:
        out += "\n\nUsing %s template for record %i.\n" % (template, recID)

    ############### FIXME: REMOVE WHEN MIGRATION IS DONE ###############
    path = "%s%s%s" % (CFG_BIBFORMAT_TEMPLATES_PATH, os.sep, template)
    template_usable = template is not None and os.access(path, os.R_OK)
    if template_usable:
        # Format with template
        (out_, errors) = format_with_format_template(template, bfo, verbose)
        errors_.extend(errors)
        out += out_
        return out

    # Template not found in new BibFormat
    if debug:
        if template is None:
            out += "\n\nNo template found for output format %s and record %i.\n" \
                   "(Check invenio.err log file for more details)\n" % (of, recID)
        else:
            out += "\n\nTemplate %s could not be read.\n" % (template)

    if CFG_PATH_PHP:
        # Fall back on the old BibFormat
        if debug:
            out += "\n\nUsing old BibFormat for record %s.\n" % recID
        return out + call_old_bibformat(recID, format=of, on_the_fly=True,
                                        verbose=verbose)
    ############################# END ##################################

    error = get_msgs_for_code_list([("ERR_BIBFORMAT_NO_TEMPLATE_FOUND", of)],
                                   stream='error', ln=CFG_SITE_LANG)
    errors_.append(error)
    if verbose == 0:
        register_errors(error, 'error')
    elif verbose > 5:
        return out + error[0][1]
    return out
def decide_format_template(bfo, of):
    """
    Picks the format template to use for the given BibFormatObject and
    output format.

    The rules of the output format are tried in order and the template
    of the first matching rule is returned. A rule matches when its
    value, used as a regular expression, matches the whole value of the
    record field, ignoring letter case and surrounding spaces on both
    sides. If no rule matches, the default template of the output
    format is used; None is returned when that default is empty.

    @param bfo: a BibFormatObject
    @param of: the code of the output format to use
    @return: the name of a format template, or None
    """
    output_format = get_output_format(of)

    for rule in output_format['rules']:
        if rule['field'].startswith('00'):
            # Rule uses controlfield
            record_value = bfo.control_field(rule['field'])
        else:
            # Rule uses datafield
            record_value = bfo.field(rule['field'])
        value = record_value.strip()
        pattern = rule['value'].strip()

        found = re.match(pattern, value, re.IGNORECASE)
        if found is None:
            continue
        # Only accept matches that span the complete value
        if found.start() == 0 and found.end() == len(value):
            return rule['template']

    default_template = output_format['default']
    if default_template == '':
        return None
    return default_template
def format_with_format_template(format_template_filename, bfo,
                                verbose=0, format_template_code=None):
    """
    Formats the record held by 'bfo' with one format template, in the
    language given by 'bfo'. Also returns the errors met on the way.

    If format_template_code is provided, it is used as template source
    and nothing is loaded from format_template_filename (the filename
    is still used to tell whether a bft or an xsl transformation
    applies). This allows to preview format code without having to save
    a file on disk.

    @param format_template_filename: the filename of a format template
    @param bfo: the object containing parameters for the current formatting
    @param format_template_code: if not empty, use code as template instead of reading format_template_filename (used for previews)
    @param verbose: the level of verbosity from 0 to 9 (O: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return: tuple (formatted text, errors)
    """
    _ = gettext_set_language(bfo.lang)

    def translate(match):
        """
        Returns the translation of the text captured by the match
        """
        return _(match.group("word"))

    collected_errors = []

    if format_template_code is None:
        format_content = get_format_template(format_template_filename)['code']
    else:
        format_content = str(format_template_code)

    bft_suffix = "." + CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION
    is_bft = format_template_filename is None or \
             format_template_filename.endswith(bft_suffix)

    if is_bft:
        # .bft: keep the right language, translate, then evaluate elements
        in_language = filter_languages(format_content, bfo.lang)
        translated = translation_pattern.sub(translate, in_language)
        (evaluated_format, collected_errors) = \
            eval_format_template_elements(translated, bfo, verbose)
    else:
        # .xsl: fetch the stored MARCXML and transform it with the stylesheet
        xml_record = '\n' + \
                     record_get_xml(bfo.recID, 'xm', on_the_fly=False)
        evaluated_format = format(xml_record, template_source=format_content)

    return (evaluated_format, collected_errors)
def eval_format_template_elements(format_template, bfo, verbose=0):
    """
    Evaluates the format elements of the given template and replaces
    each element with its value. Also returns errors.

    Every special tag matched by 'pattern_tag' in the template is
    replaced by the result of the evaluation of the corresponding
    format element.

    @param format_template: the format template code
    @param bfo: the object containing parameters for the current formatting
    @param verbose: the level of verbosity from 0 to 9 (O: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return: tuple (result, errors)
    """
    errors_ = []

    # First define insert_element_code(match), used in re.sub() function
    def insert_element_code(match):
        """
        Analyses 'match', interprets the corresponding code, and returns
        the result of the evaluation.

        Called by substitution in 'eval_format_template_elements(...)'

        @param match: a match object corresponding to the special tag that must be interpreted
        @return: the text replacing the tag
        """
        function_name = match.group("function_name")
        try:
            format_element = get_format_element(function_name, verbose)
        except Exception:
            if verbose >= 5:
                e = sys.exc_info()[1]
                return '' + \
                       cgi.escape(str(e)).replace('\n', ' ') + \
                       ''
            # Below verbosity 5 the failure is handled like an element
            # that cannot be resolved. (The name used to be left unbound
            # here, which made the test below raise.)
            format_element = None

        if format_element is None:
            error = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_RESOLVE_ELEMENT_NAME", function_name)],
                                           stream='error', ln=CFG_SITE_LANG)
            errors_.append(error)
            if verbose >= 5:
                return '' + \
                       error[0][1]+''
            # Silent mode: the tag is replaced by nothing
            return ''

        params = {}
        # Look for function parameters given in format template code
        all_params = match.group('params')
        if all_params is not None:
            for param_match in pattern_function_params.finditer(all_params):
                params[param_match.group('param')] = param_match.group('value')

        # Evaluate element with params and return (Do not return errors)
        (result, errors) = eval_format_element(format_element,
                                               bfo,
                                               params,
                                               verbose)
        errors_.append(errors)
        return result

    # Substitute special tags in the format by our own text.
    format = pattern_tag.sub(insert_element_code, format_template)

    return (format, errors_)
def eval_format_element(format_element, bfo, parameters={}, verbose=0):
    """
    Returns the result of the evaluation of the given format element
    name, with given BibFormatObject and parameters. Also returns
    the errors of the evaluation.

    Three cases are handled:
      a) the element is of type "python": its 'format' function is called
      b) the element is of type "field": the values of its tags are
         read from the record and joined
      c) the element is unknown: an error is reported

    @param format_element: a format element structure as returned by get_format_element
    @param bfo: a BibFormatObject used for formatting
    @param parameters: a dict of parameters to be used for formatting. Key is parameter and value is value of parameter
    @param verbose: the level of verbosity from 0 to 9 (O: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return: tuple (result, errors)
    """
    errors = []
    # Load special values given as parameters ('parameters' is only read,
    # never modified)
    prefix = parameters.get('prefix', "")
    suffix = parameters.get('suffix', "")
    default_value = parameters.get('default', "")
    escape = parameters.get('escape', "")

    output_text = ''

    if format_element is not None and format_element['type'] == "python":
        # a) We found an element with the tag name, of type "python"

        # Prepare a dict 'params' to pass as parameter to 'format'
        # function of element: parameters defined by the element, filled
        # with the given values or else their default values.
        # Also remember if the element overrides the 'escape' parameter.
        params = {}
        format_element_overrides_escape = False
        for param in format_element['attrs']['params']:
            param_name = param['name']
            params[param_name] = parameters.get(param_name, param['default'])
            if param_name == 'escape':
                format_element_overrides_escape = True

        # Add BibFormatObject
        params['bfo'] = bfo

        # Execute function with given parameters and return result.
        function = format_element['code']
        try:
            output_text = function(**params)
        except Exception:
            e = sys.exc_info()[1]
            name = format_element['attrs']['name']
            error = ("ERR_BIBFORMAT_EVALUATING_ELEMENT", name, str(params))
            errors.append(error)
            if verbose == 0:
                register_errors(errors, 'error')
            elif verbose >= 5:
                tb = sys.exc_info()[2]
                error_string = get_msgs_for_code_list(error,
                                                      stream='error',
                                                      ln=CFG_SITE_LANG)
                stack = traceback.format_exception(Exception, e, tb, limit=None)
                output_text = ''+ \
                              str(error_string[0][1]) + "".join(stack) +' '

        # None can be returned when evaluating function
        if output_text is None:
            output_text = ""
        else:
            output_text = str(output_text)

        # Escaping:
        # (1) By default, everything is escaped in mode 1
        # (2) If evaluated element has 'escape_values()' function, use
        #     its returned value as escape mode, and override (1)
        # (3) If template has a defined parameter 'escape' (in allowed
        #     values), use it, and override (1) and (2). If this
        #     'escape' parameter is overridden by the format element
        #     (defined in the 'format' function of the element), leave
        #     the escaping job to this element

        # (1)
        escape_mode = 1

        # (2)
        escape_function = format_element['escape_function']
        if escape_function is not None:
            try:
                escape_mode = escape_function(bfo=bfo)
            except Exception:
                # Report the name of the element itself. (A leftover
                # local used to be read here: it held a parameter name,
                # or was unbound when the element has no parameter.)
                error = ("ERR_BIBFORMAT_EVALUATING_ELEMENT_ESCAPE",
                         format_element['attrs']['name'])
                errors.append(error)
                if verbose == 0:
                    register_errors(errors, 'error')
                elif verbose >= 5:
                    error_string = get_msgs_for_code_list(error,
                                                          stream='error',
                                                          ln=CFG_SITE_LANG)
                    output_text += ''+ \
                                   str(error_string[0][1]) +' '
        # (3)
        if escape in ['0', '1', '2', '3', '4', '5', '6']:
            escape_mode = int(escape)

        # If escape is equal to 1, then escape all
        # HTML reserved chars.
        if escape_mode > 0 and not format_element_overrides_escape:
            output_text = escape_field(output_text, mode=escape_mode)

        # Add prefix and suffix if they have been given as parameters and if
        # the evaluation of element is not empty
        if output_text.strip() != "":
            output_text = prefix + output_text + suffix

        # Add the default value if output_text is empty
        if output_text == "":
            output_text = default_value

        return (output_text, errors)

    elif format_element is not None and format_element['type'] == "field":
        # b) We have not found an element in files that has the tag
        # name. Then look for it in the table "tag"

        # Load special values given as parameters. The key used to be
        # spelled 'separator ' (trailing space); it is still accepted as
        # a fallback for callers passing the dict directly.
        separator = parameters.get('separator',
                                   parameters.get('separator ', ""))
        nbMax = parameters.get('nbMax', "")
        escape = parameters.get('escape', "1") # By default, escape here

        # Get the fields tags that have to be printed
        tags = format_element['attrs']['tags']

        output_text = []

        # Get values corresponding to tags
        for tag in tags:
            p_tag = parse_tag(tag)
            values = record_get_field_values(bfo.get_record(),
                                             p_tag[0],
                                             p_tag[1],
                                             p_tag[2],
                                             p_tag[3])
            if len(values) > 0 and isinstance(values[0], dict):
                # flatten each dict to its values only
                for subfields in values:
                    output_text.extend(subfields.values())
            else:
                output_text.extend(values)

        if nbMax != "":
            try:
                nbMax = int(nbMax)
                output_text = output_text[:nbMax]
            except (ValueError, TypeError):
                name = format_element['attrs']['name']
                error = ("ERR_BIBFORMAT_NBMAX_NOT_INT", name)
                errors.append(error)
                if verbose < 5:
                    register_errors(error, 'error')
                elif verbose >= 5:
                    error_string = get_msgs_for_code_list(error,
                                                          stream='error',
                                                          ln=CFG_SITE_LANG)
                    # list.append() returns None: do not assign its
                    # result back to output_text
                    output_text.append(error_string[0][1])

        # Add prefix and suffix if they have been given as parameters and if
        # the evaluation of element is not empty.
        # If evaluation is empty string, return default value if it exists.
        # Else return empty string
        if ("".join(output_text)).strip() != "":
            # If escape is equal to 1, then escape all
            # HTML reserved chars.
            if escape == '1':
                output_text = cgi.escape(separator.join(output_text))
            else:
                output_text = separator.join(output_text)

            output_text = prefix + output_text + suffix
        else:
            # Return default value
            output_text = default_value

        return (output_text, errors)
    else:
        # c) Element is unknown
        error = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_RESOLVE_ELEMENT_NAME", format_element)],
                                       stream='error', ln=CFG_SITE_LANG)
        errors.append(error)
        if verbose < 5:
            register_errors(error, 'error')
            return ("", errors)
        elif verbose >= 5:
            if verbose >= 9:
                sys.exit(error[0][1])
            return ('' + \
                    error[0][1]+'', errors)
def filter_languages(format_template, ln='en'):
    """
    Filters the language tags that do not correspond to the specified language.

    Inside each block matched by 'pattern_lang', only the text of the
    tag of language 'ln' is kept. If a block has no tag for 'ln', the
    text of the tag of language CFG_SITE_LANG is kept instead.

    @param format_template: the format template code
    @param ln: the language that is NOT filtered out from the template
    @return: the format template with unnecessary languages filtered out
    """
    # First define search_lang_tag(match) and clean_language_tag(match), used
    # in re.sub() function
    def search_lang_tag(match):
        """
        Processes one language block: removes the inner localized tags
        that are not in the current language.
        If the current language cannot be found inside the block, try
        to use 'CFG_SITE_LANG'

        @param match: a match object corresponding to the special tag that must be interpreted
        """
        current_lang = ln
        def clean_language_tag(match):
            """
            Return tag text content if tag language of match is output language.

            Called by substitution in 'filter_languages(...)'

            @param match: a match object corresponding to the special tag that must be interpreted
            """
            # Tags are matched regardless of letter case, so the language
            # codes are compared regardless of case too.
            if match.group(1).lower() == current_lang.lower():
                return match.group(2)
            else:
                return ""
            # End of clean_language_tag

        lang_tag_content = match.group("langs")
        # Try to find tag with current lang. If it does not exist,
        # then current_lang becomes CFG_SITE_LANG until the end of this
        # replace. The language code is escaped as it becomes part of a
        # regular expression, and the closing tag is a real </xx> tag.
        escaped_lang = re.escape(current_lang)
        pattern_current_lang = re.compile(r"<(" + escaped_lang + \
                         r")\s*>(.*?)(</" + escaped_lang + r"\s*>)",
                                          re.IGNORECASE | re.DOTALL)
        if re.search(pattern_current_lang, lang_tag_content) is None:
            current_lang = CFG_SITE_LANG

        cleaned_lang_tag = ln_pattern.sub(clean_language_tag, lang_tag_content)
        return cleaned_lang_tag
        # End of search_lang_tag

    filtered_format_template = pattern_lang.sub(search_lang_tag, format_template)
    return filtered_format_template
def get_format_template(filename, with_attributes=False):
    """
    Returns the structured content of the given format template.

    If 'with_attributes' is true, returns the name and description. Else 'attrs' is not
    returned as key in dictionary (it might, if it has already been loaded previously)
    {'code':"<b>Some template code</b>"
     'attrs': {'name': "a name", 'description': "a description"}
    }

    Returns None when the filename has neither the format template
    extension nor the '.xsl' extension.

    @param filename: the filename of a format template
    @param with_attributes: if True, fetch the attributes (names and description) for format
    @return: structured content of format template
    """
    # Get from cache whenever possible
    global format_templates_cache

    bft_suffix = "." + CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION
    is_bft = filename.endswith(bft_suffix)
    if not is_bft and not filename.endswith(".xsl"):
        return None

    cached = format_templates_cache.get(filename)
    if cached is not None and \
           (not with_attributes or 'attrs' in cached):
        # The cached entry is good enough when attributes are not
        # required, or when it already holds them. (It used to be
        # returned in the second case only, so plain lookups always went
        # back to the disk.) Otherwise reload with attributes.
        return cached

    format_template = {'code':""}
    loaded = False
    try:
        path = "%s%s%s" % (CFG_BIBFORMAT_TEMPLATES_PATH, os.sep, filename)
        format_file = open(path)
        try:
            format_content = format_file.read()
        finally:
            # Close the file even when reading fails
            format_file.close()

        # Load format template code
        # Remove name and description
        if is_bft:
            code_and_description = pattern_format_template_name.sub("",
                                                                    format_content, 1)
            code = pattern_format_template_desc.sub("", code_and_description, 1)
        else:
            code = format_content

        format_template['code'] = code
        loaded = True
    except Exception:
        e = sys.exc_info()[1]
        errors = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_READ_TEMPLATE_FILE", filename, str(e))],
                                        stream='error', ln=CFG_SITE_LANG)
        register_errors(errors, 'error')

    # Save attributes if necessary
    if with_attributes:
        format_template['attrs'] = get_format_template_attrs(filename)

    # Cache and return. A template that could not be read is not cached,
    # so that the next call tries to read it again.
    if loaded:
        format_templates_cache[filename] = format_template

    return format_template
def get_format_templates(with_attributes=False):
    """
    Returns all format templates found in CFG_BIBFORMAT_TEMPLATES_PATH,
    as a dictionary with filenames as keys and the structures returned
    by get_format_template() as values:

    {filename: {'code':"<b>Some template code</b>"
                'attrs': {'name': "a name", 'description': "a description"}
               },
    ...
    }

    Only files with the format template extension or the '.xsl'
    extension are considered. If 'with_attributes' is true, the name
    and description are loaded too. Else 'attrs' is not necessarily
    present in each dictionary.

    @param with_attributes: if True, fetch the attributes (names and description) for formats
    @return: a dictionary of format templates
    """
    accepted_suffixes = ["." + CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION,
                         ".xsl"]
    templates_by_filename = {}

    for candidate in os.listdir(CFG_BIBFORMAT_TEMPLATES_PATH):
        is_template = False
        for suffix in accepted_suffixes:
            if candidate.endswith(suffix):
                is_template = True
                break
        if is_template:
            templates_by_filename[candidate] = \
                get_format_template(candidate, with_attributes)

    return templates_by_filename
def get_format_template_attrs(filename):
    """
    Returns the attributes {'name', 'description'} of the format
    template with given filename.

    For '.xsl' files the name is the filename without its extension and
    the description stays empty. For other templates both are read from
    the template file; the filename is used as name when the file
    declares none. If the file cannot be read, an error is registered
    and the filename is used as name.

    @param filename: the filename of a format template
    @return: a dictionary with keys 'name' and 'description'
    """
    attrs = {'name': "", 'description': ""}
    template_path = "%s%s%s" % (CFG_BIBFORMAT_TEMPLATES_PATH,
                                os.sep,
                                filename)
    try:
        template_file = open(template_path)
        code = template_file.read()
        template_file.close()

        if filename.endswith(".xsl"):
            # .xsl
            attrs['name'] = filename[:-4]
        else:
            # .bft
            name_match = pattern_format_template_name.search(code)
            if name_match is None:
                attrs['name'] = filename
            else:
                attrs['name'] = name_match.group('name')

            desc_match = pattern_format_template_desc.search(code)
            if desc_match is not None:
                attrs['description'] = desc_match.group('desc').rstrip('.')
    except Exception:
        e = sys.exc_info()[1]
        errors = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_READ_TEMPLATE_FILE",
                                          filename, str(e))],
                                        stream='error', ln=CFG_SITE_LANG)
        register_errors(errors, 'error')
        attrs['name'] = filename

    return attrs
def get_format_element(element_name, verbose=0, with_built_in_params=False):
    """
    Loads a format element and returns its structured content, or None
    if the element cannot be loaded (file not found, not readable or
    invalid).

    The returned structure is {'attrs': {some attributes in dict. See get_format_element_attrs_from_*}
                               'code': the_function_code,
                               'type':"field" or "python" depending if element is defined in file or table,
                               'escape_function': the function to call to know if element output must be escaped}

    Loaded elements are kept in 'format_elements_cache'.

    @param element_name: the name of the format element to load
    @param verbose: the level of verbosity from 0 to 9 (O: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @param with_built_in_params: if True, load the parameters built in all elements
    @return: a dictionary with format element attributes
    """
    global format_elements_cache

    def report(messages):
        """
        Registers the messages in silent mode, or writes the first one
        on stderr from verbosity 5
        """
        if verbose == 0:
            register_errors(messages, 'error')
        elif verbose >= 5:
            sys.stderr.write(messages[0][1])

    errors = []

    # Resolve filename and prepare 'name' as key for the cache
    filename = resolve_format_element_filename(element_name)
    if filename is None:
        name = element_name.upper()
    else:
        name = filename.upper()

    # Get from cache whenever possible
    if name in format_elements_cache:
        element = format_elements_cache[name]
        if not with_built_in_params or 'builtin_params' in element['attrs']:
            return element

    if filename is None:
        # Element is maybe in tag table
        if not bibformat_dblayer.tag_exists_for_name(element_name):
            errors = get_msgs_for_code_list([("ERR_BIBFORMAT_FORMAT_ELEMENT_NOT_FOUND",
                                              element_name)],
                                            stream='error', ln=CFG_SITE_LANG)
            report(errors)
            return None

        format_element = {'attrs': get_format_element_attrs_from_table( \
                                       element_name,
                                       with_built_in_params),
                          'code':None,
                          'escape_function':None,
                          'type':"field"}
        # Cache and returns
        format_elements_cache[name] = format_element
        return format_element

    # Element is defined in a file
    format_element = {}

    module_name = filename
    if module_name.endswith(".py"):
        module_name = module_name[:-3]

    # Load element
    try:
        module = __import__(CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH + \
                            "." + module_name)

        # Load last module in import path
        # For eg. load bfe_name in
        # invenio.bibformat_elements.bfe_name
        # Used to keep flexibility regarding where elements
        # directory is (for eg. test cases)
        for comp in CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH.split(".")[1:]:
            module = getattr(module, comp)
    except Exception:
        # We catch all exceptions here, as we just want to print
        # traceback in all cases
        (dummy, e, tb) = sys.exc_info()
        stack = traceback.format_exception(Exception, e, tb, limit=None)
        errors = get_msgs_for_code_list([("ERR_BIBFORMAT_IN_FORMAT_ELEMENT",
                                          element_name,"\n" + "\n".join(stack[-2:-1]))],
                                        stream='error', ln=CFG_SITE_LANG)
        report(errors)

    if errors:
        if verbose >= 7:
            raise Exception(errors[0][1])
        return None

    # Load function 'format()' inside element
    try:
        function_format = module.__dict__[module_name].format
        format_element['code'] = function_format
    except AttributeError:
        errors = get_msgs_for_code_list([("ERR_BIBFORMAT_FORMAT_ELEMENT_FORMAT_FUNCTION",
                                          element_name)],
                                        stream='error', ln=CFG_SITE_LANG)
        report(errors)

    if errors:
        if verbose >= 7:
            raise Exception(errors[0][1])
        return None

    # Load function 'escape_values()' inside element
    format_element['escape_function'] = getattr(module.__dict__[module_name],
                                                'escape_values',
                                                None)

    # Prepare, cache and return
    format_element['attrs'] = get_format_element_attrs_from_function( \
                                  function_format,
                                  element_name,
                                  with_built_in_params)
    format_element['type'] = "python"
    format_elements_cache[name] = format_element
    return format_element
def get_format_elements(with_built_in_params=False):
    """
    Returns the list of format elements attributes as dictionary structure

    Elements declared in files have priority over element declared in 'tag' table
    The returned object has this format:
    {element_name1: {'attrs': {'description':..., 'seealso':...
                               'params':[{'name':..., 'default':..., 'description':...}, ...]
                               'builtin_params':[{'name':..., 'default':..., 'description':...}, ...]
                              },
                     'code': code_of_the_element
                    },
     element_name2: {...},
     ...}

    Returns only elements that could be loaded (not error in code)

    @param with_built_in_params: if True, load the parameters built in all elements
    @return: a dict of format elements with name as key, and a dict as attributes
    """
    format_elements = {}

    # Elements declared in the 'tag' table
    mappings = bibformat_dblayer.get_all_name_tag_mappings()
    for name in mappings:
        element = get_format_element(name,
                                     with_built_in_params=with_built_in_params)
        # get_format_element() returns None for an element that cannot be
        # loaded: leave it out, as for the elements declared in files.
        if element is not None:
            format_elements[name.upper().replace(" ", "_").strip()] = element

    # Elements declared in files. They are processed last, so they
    # replace the elements of the same name found in the table.
    files = os.listdir(CFG_BIBFORMAT_ELEMENTS_PATH)
    for filename in files:
        filename_test = filename.upper().replace(" ", "_")
        if filename_test.endswith(".PY") and filename.upper() != "__INIT__.PY":
            if filename_test.startswith("BFE_"):
                filename_test = filename_test[4:]
            # Drop the '.PY' extension
            element_name = filename_test[:-3]
            element = get_format_element(element_name,
                                         with_built_in_params=with_built_in_params)
            if element is not None:
                format_elements[element_name] = element

    return format_elements
def get_format_element_attrs_from_function(function, element_name,
                                           with_built_in_params=False):
    """
    Builds the attributes of a format element from its formatting function.

    The signature of 'function' provides the parameter names and their
    default values. Its docstring (when it is a plain string) provides
    the description of the element, the related elements ('@see:') and
    the description of each parameter.

    The returned dictionary has this shape:

        {'name': "ELEMENT_NAME",  # 'element_name' upper-cased, spaces as '_'
         'description': "a string description of the element",
         'seealso': ["element_1.py", "element_2.py", ...],
         'params': [{'name': "param_name",
                     'default': "default value",
                     'description': "a description"}, ...],
         'builtin_params': [{'name': "param_name",
                             'default': "default value",
                             'description': "a description"}, ...]
        }

    The 'bfo' argument is never listed in 'params'. The
    'builtin_params' key only exists when 'with_built_in_params' is true.

    @param function: the formatting function of a format element
    @param element_name: the name of the element
    @param with_built_in_params: if True, load the parameters built in all elements
    @return: the attributes of the element, as a dictionary
    """
    attrs = {'description': "",
             'name': element_name.replace(" ", "_").upper(),
             'seealso': []}

    docstring = function.__doc__
    has_docstring = isinstance(docstring, str)

    if has_docstring:
        # The description is the text located before the first
        # '@param' marker and before the first '@see:' marker
        #match = pattern_format_element_desc.search(docstring)
        summary = docstring.split("@param")[0].split("@see:")[0]
        attrs['description'] = summary.strip().rstrip('.')

        # Related elements are listed, comma separated, after '@see:'
        see_match = pattern_format_element_seealso.search(docstring)
        if see_match is not None:
            related = see_match.group('see').rstrip('.').split(",")
            attrs['seealso'].extend([item.strip() for item in related])

    # Read the parameters from the function definition
    (arg_names, dummy_varargs, dummy_varkw, defaults) = \
                inspect.getargspec(function)

    # Defaults belong to the last arguments: walk both sequences
    # backwards, and pad the missing defaults with None
    arg_names = arg_names[::-1]
    if defaults is None:
        default_values = []
    else:
        default_values = list(defaults)[::-1]
    default_values.extend([None] * (len(arg_names) - len(default_values)))

    params = {}
    for (arg, default) in zip(arg_names, default_values):
        if arg == "bfo":
            # Hidden to users, and present in all elements of this kind
            continue
        if default is None:
            # In case no check is made inside element, we prefer to
            # print "" (nothing) than None in output
            default = ""
        params[arg] = {'name': arg,
                       'default': default,
                       'description': "(no description provided)"}

    if has_docstring:
        # Attach the descriptions found in the docstring to the
        # parameters that really exist in the signature
        for param_match in pattern_format_element_params.finditer(docstring):
            documented_name = param_match.group('name')
            if documented_name in params:
                params[documented_name]['description'] = \
                        param_match.group('desc').rstrip('.')

    attrs['params'] = params.values()

    # Load built-in parameters if necessary
    if with_built_in_params:
        attrs['builtin_params'] = [
            {'name': "prefix",
             'default': "",
             'description': "A prefix printed only if the\nrecord has a value for this element"},
            {'name': "suffix",
             'default': "",
             'description': "A suffix printed only if the\nrecord has a value for this element"},
            {'name': "default",
             'default': "",
             'description': "A default value printed if the\nrecord has no value for this element"},
            {'name': "escape",
             'default': "",
             'description': "0 keeps value as it is. Refers to main\ndocumentation for escaping modes\n1 to 6"},
            ]

    return attrs
def get_format_element_attrs_from_table(element_name,
                                        with_built_in_params=False):
    """
    Builds the attributes of the format element named 'element_name',
    using the tags that bibformat_dblayer.get_tags_from_name() returns
    for that name.

    The returned dictionary has this shape:

        {'name': "ELEMENT_NAME",  # 'element_name' upper-cased, spaces as '_'
         'description': "Prints field(s) ... of the record",
         'seealso': [],           # always empty in this case
         'params': [],            # always empty in this case
         'tags': ["950.1", "203.a"],  # the tags printed by this element
         'builtin_params': [{'name': "param_name",
                             'default': "default value",
                             'description': "a description"}, ...]
        }

    The 'builtin_params' key only exists when 'with_built_in_params'
    is true.

    @param element_name: an element name in database
    @param with_built_in_params: if True, load the parameters built in all elements
    @return: the attributes of the element, as a dictionary
    """
    tags = bibformat_dblayer.get_tags_from_name(element_name)

    # Singular or plural wording, depending on the number of tags
    if len(tags) > 1:
        field_label = "fields"
    else:
        field_label = "field"

    attrs = {'description': "Prints %s %s of the record" % (field_label,
                                                           ", ".join(tags)),
             'name': element_name.replace(" ", "_").upper(),
             'seealso': [],
             'params': [],
             'tags': tags}

    # Load built-in parameters if necessary
    if with_built_in_params:
        attrs['builtin_params'] = [
            {'name': "prefix",
             'default': "",
             'description': "A prefix printed only if the\nrecord has a value for this element"},
            {'name': "suffix",
             'default': "",
             'description': "A suffix printed only if the\nrecord has a value for this element"},
            {'name': "separator",
             'default': " ",
             'description': "A separator between elements of\nthe field"},
            {'name': "nbMax",
             'default': "",
             'description': "The maximum number of values to\nprint for this element. No limit if not\nspecified"},
            {'name': "default",
             'default': "",
             'description': "A default value printed if the\nrecord has no value for this element"},
            {'name': "escape",
             'default': "",
             'description': "If set to 1, replaces special\ncharacters '&', '<' and '>' of this\nelement by SGML entities"},
            ]

    return attrs
def get_output_format(code, with_attributes=False, verbose=0):
    """
    Returns the structured content of the given output format.

    If 'with_attributes' is true, also returns the names and description
    of the output format under the 'attrs' key. Otherwise 'attrs' is not
    added (it may still be present if it was loaded into the cache by a
    previous call). See get_output_format_attrs() to learn more on the
    attributes.

    If no output format file matches 'code', an error is registered and
    an empty structure is returned (with 'attrs' if asked for).

        {'rules': [ {'field': "980__a",
                     'value': "PREPRINT",
                     'template': "filename_a.bft",
                    },
                    {...}
                  ],
         'attrs': {'names': {'generic':"a name", 'sn':{'en': "a name", 'fr':"un nom"}, 'ln':{'en':"a long name"}}
                   'description': "a description"
                   'code': "fnm1",
                   'content_type': "application/ms-excel",
                   'visibility': 1
                  }
         'default':"filename_b.bft"
        }

    The file is read line by line:
      - a line ending with ':' sets the current tag (all the words
        after the first one, joined together);
      - a line containing '---' adds a rule for the current tag: the
        text after the last '---' is the template, the text before is
        the value to match;
      - any other line containing ':' sets the default template (the
        text between the first and the second ':').

    If reading fails, an error is registered and whatever could be
    parsed so far is cached and returned.

    @param code: the code of an output_format
    @param with_attributes: if True, fetch the attributes (names and description) for format
    @param verbose: the level of verbosity from 0 to 9 (O: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return: structured content of output format
    """
    global format_outputs_cache

    output_format = {'rules': [], 'default': ""}

    filename = resolve_output_format_filename(code, verbose)
    if filename is None:
        errors = get_msgs_for_code_list([("ERR_BIBFORMAT_OUTPUT_FORMAT_CODE_UNKNOWN", code)],
                                        stream='error', ln=CFG_SITE_LANG)
        register_errors(errors, 'error')
        if with_attributes: #Create empty attrs if asked for attributes
            output_format['attrs'] = get_output_format_attrs(code, verbose)
        return output_format

    # Get from cache whenever possible
    if filename in format_outputs_cache:
        cached_format = format_outputs_cache[filename]
        # If we must return with attributes but cache has no
        # attributes, then load attributes
        if with_attributes and 'attrs' not in cached_format:
            cached_format['attrs'] = get_output_format_attrs(code, verbose)
        return cached_format

    try:
        if with_attributes:
            output_format['attrs'] = get_output_format_attrs(code, verbose)

        path = "%s%s%s" % (CFG_BIBFORMAT_OUTPUTS_PATH, os.sep, filename)
        format_file = open(path)
        try:
            current_tag = ''
            for line in format_file:
                line = line.strip()
                if line == "":
                    # Ignore blank lines
                    continue
                if line.endswith(":"):
                    # Retrieve tag
                    # Remove : spaces and eol at the end of line
                    clean_line = line.rstrip(": \n\r")
                    # The tag starts at second position
                    current_tag = "".join(clean_line.split()[1:]).strip()
                elif line.find('---') != -1:
                    words = line.split('---')
                    template = words[-1].strip()
                    condition = ''.join(words[:-1])
                    output_format['rules'].append({'field': current_tag,
                                                   'value': condition,
                                                   'template': template,
                                                   })
                elif line.find(':') != -1:
                    # Default case
                    default = line.split(':')[1].strip()
                    output_format['default'] = default
        finally:
            # Always release the file handle, even if parsing failed
            format_file.close()
    except Exception:
        # sys.exc_info() is used instead of binding the exception in
        # the 'except' clause so that this code parses with both old
        # and recent Python versions
        e = sys.exc_info()[1]
        errors = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_READ_OUTPUT_FILE", filename, str(e))],
                                        stream='error', ln=CFG_SITE_LANG)
        register_errors(errors, 'error')

    # Cache and return
    format_outputs_cache[filename] = output_format
    return output_format
def get_output_format_attrs(code, verbose=0):
    """
    Returns the attributes of an output format.

    The attributes are:
      - 'code': the short identifier of the output format, upper-cased
        (to be given as parameter in format_record function to specify
        the output format);
      - 'description': a description of the output format;
      - 'visibility': the visibility of the format in the output format
        list on public pages;
      - 'content_type': if specified, the search_engine will send a file
        with this content type and with result of formatting as content
        to the user;
      - 'names': the names of the output format.

    When no file can be resolved for 'code', the default values below
    are returned as they are. Otherwise 'names', 'description',
    'content_type' and 'visibility' are read through bibformat_dblayer.

        {'names': {'generic':"", 'ln':{}, 'sn':{}},
         'description': '',
         'code': "FNM1",
         'content_type': "",
         'visibility': 1
        }

    @param code: the short identifier of the format (a trailing
                 output format extension is ignored)
    @param verbose: the level of verbosity from 0 to 9 (O: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return: structured content of output format attributes
    """
    extension_suffix = "." + CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION
    if code.endswith(extension_suffix):
        # Work with the bare code, without ".extension"
        code = code[:-len(extension_suffix)]

    attrs = {'names': {'generic': "",
                       'ln': {},
                       'sn': {}},
             'description': '',
             'code': code.upper(),
             'content_type': "",
             'visibility': 1}

    if resolve_output_format_filename(code, verbose) is None:
        # Unknown output format: keep the default values
        return attrs

    loaders = (('names', bibformat_dblayer.get_output_format_names),
               ('description', bibformat_dblayer.get_output_format_description),
               ('content_type', bibformat_dblayer.get_output_format_content_type),
               ('visibility', bibformat_dblayer.get_output_format_visibility))
    for (attribute, loader) in loaders:
        attrs[attribute] = loader(code)

    return attrs
def get_output_formats(with_attributes=False):
    """
    Returns all the output formats found in CFG_BIBFORMAT_OUTPUTS_PATH,
    as a dictionary with their filename as key.

    Only the files carrying the output format extension are considered.
    Each value is what get_output_format() returns for the code built
    from the filename (the dot-separated parts of the filename, minus
    the last one, joined together).

    If 'with_attributes' is true, the names and description of the
    output formats are also loaded, else 'attrs' is not added to the
    dictionaries (it might be there if it has already been loaded
    previously). See get_output_format_attrs() to learn more on the
    attributes.

        {'filename_1.bfo': {'rules': [ {'field': "980__a",
                                        'value': "PREPRINT",
                                        'template': "filename_a.bft",
                                       },
                                       {...}
                                     ],
                            'attrs': {'names': {'generic':"a name", 'sn':{'en': "a name", 'fr':"un nom"}, 'ln':{'en':"a long name"}}
                                      'description': "a description"
                                      'code': "fnm1"
                                     }
                            'default':"filename_b.bft"
                           },
         'filename_2.bfo': {...},
         ...
        }

    @param with_attributes: if True, fetch the attributes of each format
    @return: the output formats, keyed by filename
    """
    wanted_suffix = "." + CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION

    output_formats = {}
    for filename in os.listdir(CFG_BIBFORMAT_OUTPUTS_PATH):
        if not filename.endswith(wanted_suffix):
            continue
        code = "".join(filename.split(".")[:-1])
        output_formats[filename] = get_output_format(code, with_attributes)

    return output_formats
def resolve_format_element_filename(string):
    """
    Returns the filename of the element corresponding to 'string'.

    This is necessary since format templates call elements by
    ignoring case: the same element can be referred to as
    "BFE_AUTHOR", "bfe_author" or "Bfe_Author" for example.
    Spaces and underscores are considered as equivalent.

    It is also recommended that format elements filenames are
    prefixed with bfe_ . We need to look for these too: a file
    matches when its name equals 'string' once the "BFE_" prefix
    has been ignored on either side.

    The ".py" extension is optional in 'string', and is recognized
    whatever its case.

    @param string: a name for a format element
    @return: the corresponding filename, with right case, or None if
             no file matches in CFG_BIBFORMAT_ELEMENTS_PATH
    """
    # Normalize first, then look for the extension: in that way
    # "foo.PY" is not turned into "FOO.PY.PY"
    name = string.replace(" ", "_").upper()
    if not name.endswith(".PY"):
        name += ".PY"

    files = os.listdir(CFG_BIBFORMAT_ELEMENTS_PATH)
    for filename in files:
        test_filename = filename.replace(" ", "_").upper()

        if test_filename == name or \
               test_filename == "BFE_" + name or \
               "BFE_" + test_filename == name:
            return filename

    # No element with that name found
    # Do not log error, as it might be a normal execution case:
    # element can be in database
    return None
def resolve_output_format_filename(code, verbose=0):
    """
    Returns the filename of the output format corresponding to 'code'.

    This is necessary since output formats names are not case sensitive
    but most file systems are. The files of CFG_BIBFORMAT_OUTPUTS_PATH
    are compared to 'code' (cleaned, and completed with the output
    format extension when missing) by ignoring case.

    When nothing matches, the error is registered if 'verbose' is 0,
    written to stderr if 'verbose' is 5 or more, and the program exits
    if 'verbose' is 9 or more. Nothing is reported for the values 1 to 4.

    @param code: the code for an output format
    @param verbose: the level of verbosity from 0 to 9 (O: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return: the corresponding filename, with right case, or None if not found
    """
    extension_suffix = "." + CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION

    # Remove non alphanumeric chars (except . and _)
    code = re.sub(r"[^.0-9a-zA-Z_]", "", code)
    if not code.endswith(extension_suffix):
        # No extension given: dots are not wanted either
        code = re.sub(r"\W", "", code) + extension_suffix

    wanted = code.upper()
    for candidate in os.listdir(CFG_BIBFORMAT_OUTPUTS_PATH):
        if candidate.upper() == wanted:
            return candidate

    # No output format with that name found
    errors = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_RESOLVE_OUTPUT_NAME", code)],
                                    stream='error', ln=CFG_SITE_LANG)
    if verbose == 0:
        register_errors(errors, 'error')
    elif verbose >= 5:
        message = errors[0][1]
        sys.stderr.write(message)
        if verbose >= 9:
            sys.exit(message)

    return None
def get_fresh_format_template_filename(name):
    """
    Returns a new filename and name for template with given name.

    Used when writing a new template to a file, so that the filename
    has no space and is unique in the template directory
    (CFG_BIBFORMAT_TEMPLATES_PATH).

    The filename only keeps the letters, digits and dots of 'name'.
    If a file with that name already exists, an increasing index
    (2, 3, ...) is appended until the filename is free. The same
    index is appended to the returned name.

    Returns (unique_filename, modified_name)

    @param name: name for a format template
    @return: the corresponding filename (with extension), and the
             modified name (with spaces instead of underscores)
    """
    #name = re.sub(r"\W", "", name) #Remove non alphanumeric chars
    name = name.replace(" ", "_")

    # Remove non alphanumeric chars (except .)
    base_filename = re.sub(r"[^.0-9a-zA-Z]", "", name)
    extension_suffix = "." + CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION

    filename = base_filename
    index = 1
    while os.path.exists(CFG_BIBFORMAT_TEMPLATES_PATH + os.sep + \
                         filename + extension_suffix):
        index += 1
        # Build the candidate from the cleaned base name, so that the
        # characters removed above cannot come back into the filename
        filename = base_filename + str(index)

    if index > 1:
        returned_name = (name + str(index)).replace("_", " ")
    else:
        returned_name = name.replace("_", " ")

    return (filename + extension_suffix, returned_name)
def get_fresh_output_format_filename(code):
    """
    Returns a new filename for output format with given code.

    Used when writing a new output format to a file, so that the code
    has no space and is unique in the output format directory
    (CFG_BIBFORMAT_OUTPUTS_PATH). The code is upper-cased, cleaned and
    cut to 6 chars, as the convention is that
    filename == output format code (+ .extension).

    If the filename is already taken, an increasing index (2, 3, ...)
    is appended. When the result is longer than 6 chars, the index
    replaces the end of the code instead.

    Returns (unique_filename, modified_code)

    @param code: the code of an output format
    @return: the corresponding filename, and modified code if necessary
    """
    extension_suffix = "." + CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION

    #code = re.sub(r"\W", "", code) #Remove non alphanumeric chars
    # Upper-case, no space, no non alphanumeric chars (except . and _),
    # at most 6 chars
    code = re.sub(r"[^.0-9a-zA-Z_]", "", code.upper().replace(" ", "_"))[:6]

    filename = code
    index = 2
    while os.path.exists(CFG_BIBFORMAT_OUTPUTS_PATH + os.sep + \
                         filename + extension_suffix):
        counter = str(index)
        filename = code + counter
        if len(filename) > 6:
            # Too long: the index overwrites the last chars of the code
            filename = code[:-len(counter)] + counter
        index += 1

        # We should not try more than 99999... Well I don't see how we
        # could get there.. Sanity check.
        if index >= 99999:
            errors = get_msgs_for_code_list([("ERR_BIBFORMAT_NB_OUTPUTS_LIMIT_REACHED", code)],
                                            stream='error', ln=CFG_SITE_LANG)
            register_errors(errors, 'error')
            sys.exit("Output format cannot be named as %s"%code)

    return (filename + extension_suffix, filename)
def clear_caches():
    """
    Resets the three module-level caches (Format Templates, Format
    Elements and Output Formats) to new empty dictionaries.
    """
    global format_templates_cache, format_elements_cache, format_outputs_cache
    format_templates_cache, format_elements_cache, format_outputs_cache = \
                            {}, {}, {}
class BibFormatObject:
    """
    Wraps a record together with the context of one formatting
    (language, search pattern, user, output format). An instance is
    given as parameter to the 'format' function of all format elements.

    The object provides basic accessors to the record. For full access,
    one can get the record with get_record() and then use BibRecord
    methods on the returned object.
    """

    # The record
    record = None

    # The language in which the formatting has to be done
    lang = CFG_SITE_LANG

    # A list of string describing the context in which the record has
    # to be formatted.
    # It represents the words of the user request in web interface search
    search_pattern = []

    # The id of the record
    recID = 0

    uid = None # DEPRECATED: use bfo.user_info['uid'] instead

    # The information about the user, as returned by
    # 'webuser.collect_user_info(req)'
    user_info = None

    # The format in which the record is being formatted
    format = ''

    req = None # DEPRECATED: use bfo.user_info instead

    def __init__(self, recID, ln=CFG_SITE_LANG, search_pattern=None,
                 xml_record=None, user_info=None, format=''):
        """
        Creates a new bibformat object for a given record.

        The record is designated either by its ID, or by its XML
        representation. When 'xml_record' is not None it takes
        precedence: the record is created from it right away, and the
        record ID is read from its field "001".

        'user_info' allows to grant access to some functionalities on
        a page depending on the user's privileges. It is a dictionary
        in the following form:
        user_info = {
            'remote_ip' : '',
            'remote_host' : '',
            'referer' : '',
            'uri' : '',
            'agent' : '',
            'apache_user' : '',
            'apache_group' : [],
            'uid' : -1,
            'nickname' : '',
            'email' : '',
            'group' : [],
            'guest' : '1'
            }
        When it is None, the result of collect_user_info(None) is used.

        @param recID: the id of a record
        @param ln: the language in which the record has to be formatted
        @param search_pattern: list of string representing the request used by the user in web interface
        @param xml_record: a xml string of the record to format
        @param user_info: the information of the user who will view the formatted page
        @param format: the format used for formatting this record
        """
        if xml_record is not None:
            # The record itself is given: use it and its own ID
            self.record = create_record(xml_record)[0]
            recID = record_get_field_value(self.record, "001")

        self.lang = wash_language(ln)

        if search_pattern is None:
            # Each instance gets its own list
            search_pattern = []
        self.search_pattern = search_pattern

        self.recID = recID
        self.format = format

        if user_info is None:
            user_info = collect_user_info(None)
        self.user_info = user_info

    def get_record(self):
        """
        Returns the record structure of this BibFormatObject instance.
        The record is fetched from its ID on first use, then kept.

        @return: the record structure as defined by BibRecord library
        """
        from invenio.search_engine import get_record

        if self.record is None:
            # Not loaded yet: create it on-the-fly from the record ID
            self.record = get_record(self.recID)

        return self.record

    def control_field(self, tag, escape=0):
        """
        Returns the value of control field given by tag in record.

        @param tag: the marc code of a field
        @param escape: 0 for no escaping. Any other value is given as
                       mode to escape_field()
        @return: value of field tag in record, or an empty string when
                 the record is not available
        """
        record = self.get_record()
        if record is None:
            # Case where BibRecord could not parse object
            return ''

        parsed_tag = parse_tag(tag)
        raw_value = record_get_field_value(record,
                                           parsed_tag[0],
                                           parsed_tag[1],
                                           parsed_tag[2],
                                           parsed_tag[3])
        if escape != 0:
            return escape_field(raw_value, escape)
        return raw_value

    def field(self, tag, escape=0):
        """
        Returns the first value of the field corresponding to tag in
        the current record, that is bfo.fields(tag)[0] (see the
        docstring of fields()).

        If the field has no value, returns an empty string.

        'escape' selects how the special characters of the value are
        escaped: 0 means no escaping, any other value is given as mode
        to escape_field() (see that function for the available modes).

        @param tag: the marc code of a field
        @param escape: the escaping mode (0: no escaping)
        @return: value of field tag in record
        """
        values = self.fields(tag)
        if len(values) == 0:
            return ""

        first_value = values[0]
        if escape != 0:
            return escape_field(first_value, escape)
        return first_value

    def fields(self, tag, escape=0, repeatable_subfields_p=False):
        """
        Returns the list of values corresponding to "tag".

        If the tag has a subcode, simply returns the list of values
        corresponding to tag. If tag has an undefined subcode (such as
        999C5), returns a list of dictionaries, one per instance of the
        field, whose keys are the subcodes and whose values are the
        values of tag.subcode.

        Eg. for given MARC:
            999C5 $a value_1a $b value_1b
            999C5 $b value_2b
            999C5 $b value_3b $b value_3b_bis

            >> bfo.fields('999C5b')
            >> ['value_1b', 'value_2b', 'value_3b', 'value_3b_bis']
            >> bfo.fields('999C5')
            >> [{'a':'value_1a', 'b':'value_1b'},
                {'b':'value_2b'},
                {'b':'value_3b'}]

        By default only one value is kept for each subcode of an
        instance (that is, subfields are considered as not repeatable),
        which is why a single value is shown for the third instance
        above. This gives a string instead of a list, which is simpler
        as most of the time subfields are not repeatable. Do not rely
        on which one of the repeated values is kept.

        Set 'repeatable_subfields_p' to True to get all the values of
        each subcode, as lists:

            >> bfo.fields('999C5', repeatable_subfields_p=True)
            >> [{'a':['value_1a'], 'b':['value_1b']},
                {'b':['value_2b']},
                {'b':['value_3b', 'value_3b_bis']}]

        NOTICE THAT THE RETURNED STRUCTURE IS DIFFERENT.
        'repeatable_subfields_p' has no effect when the tag has a
        subcode: the result is then always a flat list of values.

        'escape' selects how the special characters of the values are
        escaped: 0 means no escaping, any other value is given as mode
        to escape_field() (see that function for the available modes).

        @param tag: the marc code of a field
        @param escape: the escaping mode (0: no escaping)
        @param repeatable_subfields_p: if True, returns the list of subfields in the dictionary
        @return: values of field tag in record (empty list when the
                 record is not available)
        """
        record = self.get_record()
        if record is None:
            # Case where BibRecord could not parse object
            return []

        parsed_tag = parse_tag(tag)

        if parsed_tag[3] != "":
            # Subcode has been defined. Simply returns list of values
            values = record_get_field_values(record,
                                             parsed_tag[0],
                                             parsed_tag[1],
                                             parsed_tag[2],
                                             parsed_tag[3])
            if escape != 0:
                return [escape_field(value, escape) for value in values]
            return values

        # Subcode is undefined. Returns list of dicts.
        # However it might be the case of a control field.
        instances = record_get_field_instances(record,
                                               parsed_tag[0],
                                               parsed_tag[1],
                                               parsed_tag[2])

        if repeatable_subfields_p:
            # One dict per instance, mapping each subcode to the list
            # of all its values
            grouped_instances = []
            for instance in instances:
                grouped = {}
                for subfield in instance[0]:
                    subfield_value = subfield[1]
                    if escape != 0:
                        subfield_value = escape_field(subfield_value, escape)
                    grouped.setdefault(subfield[0], []).append(subfield_value)
                grouped_instances.append(grouped)
            return grouped_instances

        # One dict per instance, mapping each subcode to a single value
        plain_instances = []
        for instance in instances:
            if escape == 0:
                plain_instances.append(dict(instance[0]))
            else:
                plain_instances.append(
                    dict([(subfield[0], escape_field(subfield[1], escape)) \
                          for subfield in instance[0]]))
        return plain_instances

    def kb(self, kb, string, default=""):
        """
        Returns the value of the "string" in the knowledge base "kb".

        If 'string' is None, or if no value can be read for it in
        'kb', returns 'default' (empty string if not specified).

        @param kb: a knowledge base name
        @param string: the string we want to translate
        @param default: a default value returned if 'string' not found in 'kb'
        @return: the value found for 'string', or 'default'
        """
        if string is None:
            return default

        matches = get_kbr_values(kb, searchkey=string, searchtype='e')
        try:
            return matches[0][0]
        except:
            # Nothing usable was returned: fall back on the default
            return default
def _wash_or_escape(value, allowed_attribute_whitelist, allowed_tag_whitelist):
    """
    Washes 'value' with the given whitelists. If the HTML cannot be
    parsed, escapes all HTML/XML characters of 'value' instead.
    """
    try:
        return washer.wash(value,
                           allowed_attribute_whitelist=\
                           allowed_attribute_whitelist,
                           allowed_tag_whitelist=\
                           allowed_tag_whitelist)
    except HTMLParseError:
        # Parsing failed
        return cgi.escape(value)

def escape_field(value, mode=0):
    """
    Utility function used to escape the value of a field in given mode.

      - mode 0: no escaping
      - mode 1: escaping all HTML/XML characters (escaped chars are
                shown as escaped)
      - mode 2: washing the value with the default whitelists of tags
                and attributes, to avoid XSS while keeping basic tags
      - mode 3: mix of mode 1 and mode 2. If the value, once leading
                spaces and newlines are ignored, starts with the
                'html_field' marker, then use mode 2. Else use mode 1.
      - mode 4: washing the value with empty whitelists (no tag and no
                attribute allowed)
      - mode 5: same as 2, but allows more tags ('img', 'table', 'td',
                'tr', 'th', 'span', 'caption') and more attributes
      - mode 6: same as 3, but with the whitelists of mode 5

    Whenever washing fails because the HTML cannot be parsed, the
    value is escaped as in mode 1. Any other mode returns the value
    unchanged.

    @param value: the value to escape
    @param mode: the escaping mode, as described above
    @return: the escaped value
    """
    if mode == 1:
        return cgi.escape(value)

    if mode == 4:
        return _wash_or_escape(value, [], [])

    if mode in [2, 3, 5, 6]:
        if mode in [3, 6] and \
               not value.lstrip(' \n').startswith(html_field):
            # Not flagged as HTML: escape everything
            return cgi.escape(value)

        allowed_attribute_whitelist = cfg_html_buffer_allowed_attribute_whitelist
        # NOTE(review): 'class' is added to the *tag* whitelist here,
        # although it looks like an attribute name. Kept as it is:
        # confirm the intent before changing it.
        allowed_tag_whitelist = cfg_html_buffer_allowed_tag_whitelist + \
                                ('class',)
        if mode in [5, 6]:
            # Extended modes accept images, tables and some styling
            allowed_attribute_whitelist += ('src', 'alt',
                                            'width', 'height',
                                            'style', 'summary',
                                            'border', 'cellspacing',
                                            'cellpadding')
            allowed_tag_whitelist += ('img', 'table', 'td',
                                      'tr', 'th', 'span', 'caption')

        return _wash_or_escape(value,
                               allowed_attribute_whitelist,
                               allowed_tag_whitelist)

    return value
def bf_profile():
    """
    Runs a benchmark: formats the records 1 to 50 in "HD" format, in
    the language of the site, with the highest verbosity.
    """
    first_recid, last_recid = 1, 50
    for recid in range(first_recid, last_recid + 1):
        format_record(recid, "HD", ln=CFG_SITE_LANG, verbose=9, search_pattern=[])
if __name__ == "__main__":
    # When run as a script: profile the benchmark, dump the raw data
    # to the file "bibformat_profile", then print the statistics
    # sorted by cumulative time
    import pstats
    import profile
    profile.run('bf_profile()', "bibformat_profile")
    pstats.Stats("bibformat_profile").strip_dirs().sort_stats("cumulative").print_stats()
diff --git a/modules/bibformat/lib/bibformat_engine_tests.py b/modules/bibformat/lib/bibformat_engine_tests.py
index bc7d4f47b..8dcc01434 100644
--- a/modules/bibformat/lib/bibformat_engine_tests.py
+++ b/modules/bibformat/lib/bibformat_engine_tests.py
@@ -1,805 +1,849 @@
# -*- coding: utf-8 -*-
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Test cases for the BibFormat engine. Also test
some utilities function in bibformat_utils module"""
__revision__ = "$Id$"
# pylint: disable-msg=C0301
import unittest
import os
import sys
+from invenio import bibformat
from invenio import bibformat_engine
from invenio import bibformat_utils
from invenio import bibformat_config
from invenio import bibformatadminlib
from invenio import bibrecord
from invenio.config import CFG_TMPDIR
from invenio.testutils import make_test_suite, run_test_suite
#CFG_BIBFORMAT_OUTPUTS_PATH = "..%setc%soutput_formats" % (os.sep, os.sep)
#CFG_BIBFORMAT_TEMPLATES_PATH = "..%setc%sformat_templates" % (os.sep, os.sep)
#CFG_BIBFORMAT_ELEMENTS_PATH = "elements"
# Locations used by the tests instead of the ones of the installation.
# The test cases assign these values to the corresponding variables of
# bibformat_engine before calling it.
# Output formats and format templates are both looked up directly in
# CFG_TMPDIR
CFG_BIBFORMAT_OUTPUTS_PATH = "%s" % (CFG_TMPDIR)
CFG_BIBFORMAT_TEMPLATES_PATH = "%s" % (CFG_TMPDIR)
# Format elements live in a 'tests_bibformat_elements' subdirectory of
# CFG_TMPDIR
CFG_BIBFORMAT_ELEMENTS_PATH = "%s%stests_bibformat_elements" % (CFG_TMPDIR, os.sep)
# NOTE(review): presumably the package name used to import the test
# elements once CFG_TMPDIR is on sys.path -- confirm against its users
CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH = "tests_bibformat_elements"
class FormatTemplateTest(unittest.TestCase):
""" bibformat - tests on format templates"""
def test_get_format_template(self):
"""bibformat - format template parsing and returned structure"""
bibformat_engine.CFG_BIBFORMAT_TEMPLATES_PATH = CFG_BIBFORMAT_TEMPLATES_PATH
#Test correct parsing and structure
template_1 = bibformat_engine.get_format_template("Test1.bft", with_attributes=True)
self.assert_(template_1 is not None)
self.assertEqual(template_1['code'], "test\nthis value should stay as it is\nthis one too\n")
self.assertEqual(template_1['attrs']['name'], "name_test")
self.assertEqual(template_1['attrs']['description'], "desc_test")
#Test correct parsing and structure of file without description or name
template_2 = bibformat_engine.get_format_template("Test_2.bft", with_attributes=True)
self.assert_(template_2 is not None)
self.assertEqual(template_2['code'], "test")
self.assertEqual(template_2['attrs']['name'], "Test_2.bft")
self.assertEqual(template_2['attrs']['description'], "")
#Test correct parsing and structure of file without description or name
unknown_template = bibformat_engine.get_format_template("test_no_template.test", with_attributes=True)
self.assertEqual(unknown_template, None)
def test_get_format_templates(self):
""" bibformat - loading multiple format templates"""
bibformat_engine.CFG_BIBFORMAT_TEMPLATES_PATH = CFG_BIBFORMAT_TEMPLATES_PATH
templates = bibformat_engine.get_format_templates(with_attributes=True)
#test correct loading
self.assert_("Test1.bft" in templates.keys())
self.assert_("Test_2.bft" in templates.keys())
self.assert_("Test3.bft" in templates.keys())
self.assert_("Test_no_template.test" not in templates.keys())
#Test correct pasrsing and structure
self.assertEqual(templates['Test1.bft']['code'], "test\nthis value should stay as it is\nthis one too\n")
self.assertEqual(templates['Test1.bft']['attrs']['name'], "name_test")
self.assertEqual(templates['Test1.bft']['attrs']['description'], "desc_test")
def test_get_format_template_attrs(self):
""" bibformat - correct parsing of attributes in format template"""
bibformat_engine.CFG_BIBFORMAT_TEMPLATES_PATH = CFG_BIBFORMAT_TEMPLATES_PATH
attrs = bibformat_engine.get_format_template_attrs("Test1.bft")
self.assertEqual(attrs['name'], "name_test")
self.assertEqual(attrs['description'], "desc_test")
def test_get_fresh_format_template_filename(self):
    """ bibformat - getting fresh filename for format template"""
    bibformat_engine.CFG_BIBFORMAT_TEMPLATES_PATH = CFG_BIBFORMAT_TEMPLATES_PATH

    # For the name "Test" the engine is expected to propose "Test.bft" as is.
    proposal_free = bibformat_engine.get_fresh_format_template_filename("Test")
    self.assert_(len(proposal_free) >= 2)
    self.assertEqual(proposal_free[0], "Test.bft")

    # For the name "Test1" the engine must propose something other than
    # "Test1.bft", and the proposed file must not exist on disk yet.
    proposal_taken = bibformat_engine.get_fresh_format_template_filename("Test1")
    self.assert_(len(proposal_taken) >= 2)
    self.assert_(proposal_taken[0] != "Test1.bft")
    candidate_path = os.sep.join([bibformat_engine.CFG_BIBFORMAT_TEMPLATES_PATH,
                                  proposal_taken[0]])
    self.assert_(not os.path.exists(candidate_path))
class FormatElementTest(unittest.TestCase):
    """ bibformat - tests on format elements"""

    def setUp(self):
        # pylint: disable-msg=C0103
        """bibformat - setting python path to test elements"""
        # setUp runs before every test method: register the directory only
        # once so that sys.path does not collect one duplicate per test.
        tmpdir = '%s' % CFG_TMPDIR
        if tmpdir not in sys.path:
            sys.path.append(tmpdir)

    def _assert_resolve_to_same_element(self, filenames):
        """Check that all spellings in `filenames` resolve to the same,
        non-None, format element filename.

        Every adjacent pair is compared, including the last one (the
        previous `range(len(filenames)-2)` loop never looked at the final
        entry of the list).
        """
        for current_name, next_name in zip(filenames, filenames[1:]):
            filename_1 = bibformat_engine.resolve_format_element_filename(current_name)
            self.assert_(filename_1 is not None)
            filename_2 = bibformat_engine.resolve_format_element_filename(next_name)
            self.assertEqual(filename_1, filename_2)

    def test_resolve_format_element_filename(self):
        """bibformat - resolving format elements filename """
        bibformat_engine.CFG_BIBFORMAT_ELEMENTS_PATH = CFG_BIBFORMAT_ELEMENTS_PATH

        #Test elements filename starting without bfe_, with underscore instead of space
        filenames = ["test 1", "test 1.py", "bfe_test 1", "bfe_test 1.py", "BFE_test 1",
                     "BFE_TEST 1", "BFE_TEST 1.py", "BFE_TeST 1.py", "BFE_TeST 1",
                     "BfE_TeST 1.py", "BfE_TeST 1","test_1", "test_1.py", "bfe_test_1",
                     "bfe_test_1.py", "BFE_test_1",
                     "BFE_TEST_1", "BFE_TEST_1.py", "BFE_Test_1.py", "BFE_TeST_1",
                     "BfE_TeST_1.py", "BfE_TeST_1"]
        self._assert_resolve_to_same_element(filenames)

        #Test elements filename starting with bfe_, and with underscores instead of spaces
        filenames = ["test 2", "test 2.py", "bfe_test 2", "bfe_test 2.py", "BFE_test 2",
                     "BFE_TEST 2", "BFE_TEST 2.py", "BFE_TeST 2.py", "BFE_TeST 2",
                     "BfE_TeST 2.py", "BfE_TeST 2","test_2", "test_2.py", "bfe_test_2",
                     "bfe_test_2.py", "BFE_test_2",
                     "BFE_TEST_2", "BFE_TEST_2.py", "BFE_TeST_2.py", "BFE_TeST_2",
                     "BfE_TeST_2.py", "BfE_TeST_2"]
        self._assert_resolve_to_same_element(filenames)

        #Test non existing element
        non_existing_element = bibformat_engine.resolve_format_element_filename("BFE_NON_EXISTING_ELEMENT")
        self.assertEqual(non_existing_element, None)

    def test_get_format_element(self):
        """bibformat - format elements parsing and returned structure"""
        bibformat_engine.CFG_BIBFORMAT_ELEMENTS_PATH = CFG_BIBFORMAT_ELEMENTS_PATH
        bibformat_engine.CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH = CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH

        #Test loading with different kind of names, for element with spaces in name, without bfe_
        element_1 = bibformat_engine.get_format_element("test 1", with_built_in_params=True)
        self.assert_(element_1 is not None)
        element_1_bis = bibformat_engine.get_format_element("bfe_tEst_1.py", with_built_in_params=True)
        self.assertEqual(element_1, element_1_bis)

        #Test loading with different kind of names, for element without spaces in name, with bfe_
        element_2 = bibformat_engine.get_format_element("test 2", with_built_in_params=True)
        self.assert_(element_2 is not None)
        element_2_bis = bibformat_engine.get_format_element("bfe_tEst_2.py", with_built_in_params=True)
        self.assertEqual(element_2, element_2_bis)

        #Test loading incorrect elements
        element_3 = bibformat_engine.get_format_element("test 3", with_built_in_params=True)
        self.assertEqual(element_3, None)
        element_4 = bibformat_engine.get_format_element("test 4", with_built_in_params=True)
        self.assertEqual(element_4, None)
        unknown_element = bibformat_engine.get_format_element("TEST_NO_ELEMENT", with_built_in_params=True)
        self.assertEqual(unknown_element, None)

        #Test element without docstring
        element_5 = bibformat_engine.get_format_element("test_5", with_built_in_params=True)
        self.assert_(element_5 is not None)
        self.assertEqual(element_5['attrs']['description'], '')
        self.assert_({'name':"param1",
                      'description':"(no description provided)",
                      'default':""} in element_5['attrs']['params'] )
        self.assertEqual(element_5['attrs']['seealso'], [])

        #Test correct parsing:

        #Test type of element
        self.assertEqual(element_1['type'], "python")
        #Test name = element filename, with underscore instead of spaces,
        #without BFE_ and uppercase
        self.assertEqual(element_1['attrs']['name'], "TEST_1")
        #Test description parsing
        self.assertEqual(element_1['attrs']['description'], "Prints test")
        #Test @see: parsing
        self.assertEqual(element_1['attrs']['seealso'], ["element2.py", "unknown_element.py"])
        #Test @param parsing
        self.assert_({'name':"param1",
                      'description':"desc 1",
                      'default':""} in element_1['attrs']['params'] )
        self.assert_({'name':"param2",
                      'description':"desc 2",
                      'default':"default value"} in element_1['attrs']['params'] )

        #Test non existing element
        non_existing_element = bibformat_engine.get_format_element("BFE_NON_EXISTING_ELEMENT")
        self.assertEqual(non_existing_element, None)

    def test_get_format_element_attrs_from_function(self):
        """ bibformat - correct parsing of attributes in 'format' docstring"""
        bibformat_engine.CFG_BIBFORMAT_ELEMENTS_PATH = CFG_BIBFORMAT_ELEMENTS_PATH
        bibformat_engine.CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH = CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH
        element_1 = bibformat_engine.get_format_element("test 1", with_built_in_params=True)
        # The element's 'code' entry is passed on as the function to inspect.
        function = element_1['code']
        attrs = bibformat_engine.get_format_element_attrs_from_function(function,
                                                                        element_1['attrs']['name'],
                                                                        with_built_in_params=True)

        self.assertEqual(attrs['name'], "TEST_1")
        #Test description parsing
        self.assertEqual(attrs['description'], "Prints test")
        #Test @see: parsing
        self.assertEqual(attrs['seealso'], ["element2.py", "unknown_element.py"])

    def test_get_format_elements(self):
        """bibformat - multiple format elements parsing and returned structure"""
        bibformat_engine.CFG_BIBFORMAT_ELEMENTS_PATH = CFG_BIBFORMAT_ELEMENTS_PATH
        bibformat_engine.CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH = CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH
        elements = bibformat_engine.get_format_elements()
        self.assert_(isinstance(elements, dict))
        self.assertEqual(elements['TEST_1']['attrs']['name'], "TEST_1")
        self.assertEqual(elements['TEST_2']['attrs']['name'], "TEST_2")
        # Elements 3 and 4 are the ones the loader is expected to reject.
        self.assert_("TEST_3" not in elements.keys())
        self.assert_("TEST_4" not in elements.keys())

    def test_get_tags_used_by_element(self):
        """bibformat - identification of tag usage inside element"""
        # This test points the engine at the paths from bibformat_config
        # rather than the module-level test paths used by the tests above.
        bibformat_engine.CFG_BIBFORMAT_ELEMENTS_PATH = bibformat_config.CFG_BIBFORMAT_ELEMENTS_PATH
        bibformat_engine.CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH = bibformat_config.CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH
        tags = bibformatadminlib.get_tags_used_by_element('bfe_abstract.py')
        self.failUnless(len(tags) == 4,
                        'Could not correctly identify tags used in bfe_abstract.py')
class OutputFormatTest(unittest.TestCase):
    """ bibformat - tests on output formats"""

    def _assert_basic_attrs(self, attrs, expected_code):
        """Check the attribute structure shared by all output format tests:
        empty generic name, dict-valued 'ln'/'sn' names and a code that is
        `expected_code` and at most 6 characters long."""
        self.assertEqual(attrs['names']['generic'], "")
        self.assert_(isinstance(attrs['names']['ln'], dict))
        self.assert_(isinstance(attrs['names']['sn'], dict))
        self.assertEqual(attrs['code'], expected_code)
        self.assert_(len(attrs['code']) <= 6)

    def test_get_output_format(self):
        """ bibformat - output format parsing and returned structure """
        bibformat_engine.CFG_BIBFORMAT_OUTPUTS_PATH = CFG_BIBFORMAT_OUTPUTS_PATH

        filename_1 = bibformat_engine.resolve_output_format_filename("test1")
        output_1 = bibformat_engine.get_output_format(filename_1, with_attributes=True)
        self._assert_basic_attrs(output_1['attrs'], "TEST1")

        # (field, template, value) expected for each rule, in file order.
        # The trailing blanks in some values are part of the expectation.
        expected_rules = [('980.a', 'Picture_HTML_detailed.bft', 'PICTURE '),
                          ('980.a', 'Article.bft', 'ARTICLE'),
                          ('980__a', 'Thesis_detailed.bft', 'THESIS '),
                          ('980__a', 'Pub.bft', 'PUBLICATION ')]
        self.assertEqual(len(output_1['rules']), 4)
        for rule, (field, template, value) in zip(output_1['rules'], expected_rules):
            self.assertEqual(rule['field'], field)
            self.assertEqual(rule['template'], template)
            self.assertEqual(rule['value'], value)

        filename_2 = bibformat_engine.resolve_output_format_filename("TEST2")
        output_2 = bibformat_engine.get_output_format(filename_2, with_attributes=True)
        self._assert_basic_attrs(output_2['attrs'], "TEST2")
        self.assertEqual(output_2['rules'], [])

        # An unknown format yields an empty skeleton, not None.
        unknown_output = bibformat_engine.get_output_format("unknow", with_attributes=True)
        self.assertEqual(unknown_output, {'rules':[],
                                          'default':"",
                                          'attrs':{'names':{'generic':"", 'ln':{}, 'sn':{}},
                                                   'description':'',
                                                   'code':"UNKNOW",
                                                   'visibility': 1,
                                                   'content_type':""}})

    def test_get_output_formats(self):
        """ bibformat - loading multiple output formats """
        bibformat_engine.CFG_BIBFORMAT_OUTPUTS_PATH = CFG_BIBFORMAT_OUTPUTS_PATH
        outputs = bibformat_engine.get_output_formats(with_attributes=True)
        self.assert_(isinstance(outputs, dict))
        self.assert_("TEST1.bfo" in outputs.keys())
        self.assert_("TEST2.bfo" in outputs.keys())
        self.assert_("unknow.bfo" not in outputs.keys())

        #Test correct parsing
        self._assert_basic_attrs(outputs["TEST1.bfo"]['attrs'], "TEST1")

    def test_get_output_format_attrs(self):
        """ bibformat - correct parsing of attributes in output format"""
        bibformat_engine.CFG_BIBFORMAT_OUTPUTS_PATH = CFG_BIBFORMAT_OUTPUTS_PATH
        attrs = bibformat_engine.get_output_format_attrs("TEST1")
        self._assert_basic_attrs(attrs, "TEST1")

    def test_resolve_output_format(self):
        """ bibformat - resolving output format filename"""
        bibformat_engine.CFG_BIBFORMAT_OUTPUTS_PATH = CFG_BIBFORMAT_OUTPUTS_PATH
        filenames = ["test1", "test1.bfo", "TEST1", "TeST1", "TEST1.bfo", "test1"]
        # Compare every adjacent pair, including the last one (the previous
        # `range(len(filenames)-2)` loop stopped one pair short).
        for current_name, next_name in zip(filenames, filenames[1:]):
            filename_1 = bibformat_engine.resolve_output_format_filename(current_name)
            self.assert_(filename_1 is not None)
            filename_2 = bibformat_engine.resolve_output_format_filename(next_name)
            self.assertEqual(filename_1, filename_2)

    def test_get_fresh_output_format_filename(self):
        """ bibformat - getting fresh filename for output format"""
        bibformat_engine.CFG_BIBFORMAT_OUTPUTS_PATH = CFG_BIBFORMAT_OUTPUTS_PATH

        filename_and_name_1 = bibformat_engine.get_fresh_output_format_filename("test")
        self.assert_(len(filename_and_name_1) >= 2)
        self.assertEqual(filename_and_name_1[0], "TEST.bfo")

        # An empty name is expected to fall back to the same "TEST.bfo".
        filename_and_name_1_bis = bibformat_engine.get_fresh_output_format_filename("")
        self.assert_(len(filename_and_name_1_bis) >= 2)
        self.assertEqual(filename_and_name_1_bis[0], "TEST.bfo")

        # The proposal for "test1" must differ from "TEST1.bfo" and must
        # not exist on disk yet.
        filename_and_name_2 = bibformat_engine.get_fresh_output_format_filename("test1")
        self.assert_(len(filename_and_name_2) >= 2)
        self.assert_(filename_and_name_2[0] != "TEST1.bfo")
        path = bibformat_engine.CFG_BIBFORMAT_OUTPUTS_PATH + os.sep + filename_and_name_2[0]
        self.assert_(not os.path.exists(path))

        # Over-long names must be shortened: compare against the output
        # format extension (.bfo); the former ".bft" literal is a template
        # extension and could never have matched an output format filename.
        filename_and_name_3 = bibformat_engine.get_fresh_output_format_filename("test1testlong")
        self.assert_(len(filename_and_name_3) >= 2)
        self.assert_(filename_and_name_3[0] != "TEST1TESTLONG.bfo")
        # 6 characters of code + "." + extension
        self.assert_(len(filename_and_name_3[0]) <= 6 + 1 + len(bibformat_config.CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION))
        path = bibformat_engine.CFG_BIBFORMAT_OUTPUTS_PATH + os.sep + filename_and_name_3[0]
        self.assert_(not os.path.exists(path))
# Tests for the compiled regular expressions exposed as module attributes of
# bibformat_engine (pattern_lang, ln_pattern, pattern_format_template_name,
# pattern_format_template_desc, pattern_tag, pattern_format_element_params,
# pattern_format_element_seealso).
#
# NOTE(review): several fixtures below look damaged, as if angle-bracket
# markup had been stripped from the string literals: the expected values in
# some assertions do not occur in the input text they are matched against.
# Compare with the upstream copy before relying on these tests.
class PatternTest(unittest.TestCase):
""" bibformat - tests on re patterns"""
# Checks the "langs" named group of pattern_lang on two inputs.
# NOTE(review): the second `text` below does not contain the string that the
# following assertion expects - presumably lost markup; confirm.
def test_pattern_lang(self):
""" bibformat - correctness of pattern 'pattern_lang'"""
text = '''
Here is my test text
Some wordsQuelques motsEinige Wörter garbage
Here ends the middle of my test text
EnglishFrançaisDeutschHere ends my test text
'''
result = bibformat_engine.pattern_lang.search(text)
self.assertEqual(result.group("langs"), "Some wordsQuelques motsEinige Wörter garbage ")
text = '''
Here is my test text
'''
result = bibformat_engine.pattern_lang.search(text)
self.assertEqual(result.group("langs"), "Some wordsQuelques motsEinige Wörter garbage ")
# Checks positional groups 1 and 2 of ln_pattern.
# NOTE(review): the expected group(1) value "en" does not occur in `text` as
# shown - presumably a stripped language tag; confirm.
def test_ln_pattern(self):
""" bibformat - correctness of pattern 'ln_pattern'"""
text = "Some wordsQuelques motsEinige Wörter garbage "
result = bibformat_engine.ln_pattern.search(text)
self.assertEqual(result.group(1), "en")
self.assertEqual(result.group(2), "Some words")
# Checks the 'name' named group of pattern_format_template_name.
def test_pattern_format_template_name(self):
""" bibformat - correctness of pattern 'pattern_format_template_name'"""
text = '''
garbage
a namea description on
2 lines
the content of the template
content
'''
result = bibformat_engine.pattern_format_template_name.search(text)
self.assertEqual(result.group('name'), "a name")
# Checks the 'desc' named group of pattern_format_template_desc; the expected
# description spans two lines.
def test_pattern_format_template_desc(self):
""" bibformat - correctness of pattern 'pattern_format_template_desc'"""
text = '''
garbage
a namea description on
2 lines
the content of the template
content
'''
result = bibformat_engine.pattern_format_template_desc.search(text)
self.assertEqual(result.group('desc'), '''a description on
2 lines ''')
# Checks the 'function_name' and 'params' named groups of pattern_tag.
# NOTE(review): neither "tiTLE" nor the expected params occur in `text` as
# shown - the element tag was presumably stripped from the fixture; confirm.
def test_pattern_tag(self):
""" bibformat - correctness of pattern 'pattern_tag'"""
text = '''
garbage but part of content
a namea description on
2 lines
the content of the template
my content is so nice!
'''
result = bibformat_engine.pattern_tag.search(text)
self.assertEqual(result.group('function_name'), "tiTLE")
self.assertEqual(result.group('params').strip(), '''param1="value1"
param2=""''')
# Iterates over all matches in `text` and compares the 'param' and 'value'
# named groups with the expected names/values, in order.
# NOTE(review): this uses pattern_format_element_params (see the TODO), the
# same pattern that test_pattern_format_element_params reads the groups
# 'name'/'desc' from - the intended pattern is probably a different one;
# confirm. If the pattern yields no match the loop body never runs and the
# test passes without asserting anything.
def test_pattern_function_params(self):
""" bibformat - correctness of pattern 'test_pattern_function_params'"""
text = '''
param1="" param2="value2"
param3="value3" garbage
'''
names = ["param1", "param2", "param3"]
values = ["", "value2", "value3"]
results = bibformat_engine.pattern_format_element_params.finditer(text) #TODO
param_i = 0
for match in results:
self.assertEqual(match.group('param'), names[param_i])
self.assertEqual(match.group('value'), values [param_i])
param_i += 1
# Iterates over all matches of pattern_format_element_params in a
# docstring-like text and compares the 'name' and 'desc' named groups with
# the expected @param entries, in order. As above, zero matches would make
# the test pass vacuously.
def test_pattern_format_element_params(self):
""" bibformat - correctness of pattern 'pattern_format_element_params'"""
text = '''
a description for my element
some text
@param param1: desc1
@param param2: desc2
@see: seethis, seethat
'''
names = ["param1", "param2"]
descriptions = ["desc1", "desc2"]
results = bibformat_engine.pattern_format_element_params.finditer(text) #TODO
param_i = 0
for match in results:
self.assertEqual(match.group('name'), names[param_i])
self.assertEqual(match.group('desc'), descriptions[param_i])
param_i += 1
# Checks that the 'see' named group of pattern_format_element_seealso
# captures the comma-separated list following "@see:".
def test_pattern_format_element_seealso(self):
""" bibformat - correctness of pattern 'pattern_format_element_seealso' """
text = '''
a description for my element
some text
@param param1: desc1
@param param2: desc2
@see: seethis, seethat
'''
result = bibformat_engine.pattern_format_element_seealso.search(text)
self.assertEqual(result.group('see').strip(), 'seethis, seethat')
class EscapingAndWashingTest(unittest.TestCase):
""" bibformat - test escaping and washing metadata"""
def test_escaping(self):
    """ bibformat - tests escaping HTML characters"""
    text = "Is 5 < 6 ? For sure! And what about True && False == True?"

    # mode=0: the value must come back untouched.
    result = bibformat_engine.escape_field(text, mode=0)
    self.assertEqual(result, text)

    # mode=1: HTML special characters must come back as entities. The
    # expectation used to be identical to the raw input, which made this
    # assertion indistinguishable from the mode=0 check above.
    # NOTE(review): assumes mode=1 escapes '<' and '&' to '&lt;' / '&amp;'
    # (cgi.escape style) - confirm against escape_field().
    result = bibformat_engine.escape_field(text, mode=1)
    self.assertEqual(result, 'Is 5 &lt; 6 ? For sure! And what about True &amp;&amp; False == True?')
def test_washing(self):
""" bibformat - test washing HTML tags"""
text = '''Hi dude, , please login: login here'''
# Keep only basic tags
result = bibformat_engine.escape_field(text, mode=2)
self.assert_('script' not in result.lower())
self.assert_('onclick' not in result.lower())
self.assert_('mycrappywebsite' not in result.lower())
self.assert_(' ' in result.lower())
self.assert_(' ' in result.lower().replace(' ', ''))
# Keep only basic tags only if value starts with
# directive. Otherwise escape (which is the case here)
result = bibformat_engine.escape_field(text, mode=3)
self.assert_('
"""
else:
metaheaderadd = ''
## generate navtrail:
navtrail = create_navtrail_links(cc, aas, ln)
if navtrail != '':
navtrail += ' > '
if (tab != '' or ((of != '' or of.lower() != 'hd') and of != 'hb')) and \
recID != -1:
# If we are not in information tab in HD format, customize
# the nav. trail to have a link back to main record. (Due
# to the way perform_request_search() works, hb
# (lowercase) is equal to hd)
navtrail += ' %s' % \
(CFG_SITE_URL, recID, title_message)
if (of != '' or of.lower() != 'hd') and of != 'hb':
# Export
format_name = of
query = "SELECT name FROM format WHERE code=%s"
res = run_sql(query, (of,))
if res:
format_name = res[0][0]
navtrail += ' > ' + format_name
else:
# Discussion, citations, etc. tabs
tab_label = get_detailed_page_tabs(cc, ln=ln)[tab]['label']
navtrail += ' > ' + _(tab_label)
else:
navtrail += title_message
if p:
# we are serving search/browse results pages, so insert pattern:
navtrail += ": " + cgi.escape(p)
title_message = cgi.escape(p) + " - " + title_message
## finally, print page header:
req.write(pageheaderonly(req=req, title=title_message,
navtrail=navtrail,
description=description,
keywords=keywords,
metaheaderadd=metaheaderadd,
uid=uid,
language=ln,
navmenuid='search',
navtrail_append_title_p=0,
rssurl=rssurl))
req.write(websearch_templates.tmpl_search_pagestart(ln=ln))
#else:
# req.send_http_header()
def page_end(req, of="hb", ln=CFG_SITE_LANG):
    """End page according to given output format: e.g. close XML tags, add HTML footer, etc.

    Returns an empty list when `of` is "id" (empty recID list) and None in
    every other case. Nothing is written when `req` is false (called from
    CLI) or when `of` does not start with 'h'.
    """
    if of == "id":
        # empty recID list
        return []
    if req and of.startswith('h'):
        # close the page body first, then emit the site footer
        pagebody_end = websearch_templates.tmpl_search_pageend(ln = ln)
        req.write(pagebody_end)
        footer = pagefooteronly(lastupdated=__lastupdated__, language=ln, req=req)
        req.write(footer)
    return None
def create_page_title_search_pattern_info(p, p1, p2, p3):
    """Create the search pattern bit for the page <title> web page
    HTML header.  Basically combine p and (p1,p2,p3) together so that
    the page header may be filled whether we are in the Simple Search
    or Advanced Search interface contexts.

    @param p: Simple Search pattern; when non-empty it is returned as is
    @param p1: first Advanced Search pattern
    @param p2: second Advanced Search pattern
    @param p3: third Advanced Search pattern
    @return: p if set, otherwise the non-empty patterns among p1, p2 and
        p3 joined by single spaces ("" if all of them are empty)
    """
    if p:
        return p
    # Skip empty/None patterns instead of concatenating blindly, so that an
    # empty p1 neither produces a leading blank nor (when None) a TypeError.
    return ' '.join([pattern for pattern in (p1, p2, p3) if pattern])
# Builds the markup for a day / month / year date selector and returns it as
# a string.
#
# NOTE(review): as the code stands, each `box += """"""` appends an empty
# string (Python reads `""""""` as an empty triple-quoted literal), so the
# function returns "" and never uses `name`, `selected_year`,
# `selected_month`, `selected_day` or `_`. The select/option markup that the
# "# day", "# month" and "# year" comments announce is presumably missing
# from this copy - restore it from the upstream source.
def create_inputdate_box(name="d1", selected_year=0, selected_month=0, selected_day=0, ln=CFG_SITE_LANG):
"Produces 'From Date', 'Until Date' kind of selection box. Suitable for search options."
# translation function for language `ln` (unused in the visible code)
_ = gettext_set_language(ln)
box = ""
# day
box += """"""
# month
box += """"""
# year
box += """"""
return box
def create_search_box(cc, colls, p, f, rg, sf, so, sp, rm, of, ot, aas,
                      ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3,
                      m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec,
                      action=""):
    """Create search box for 'search again in the results page' functionality.

    Prepares the option lists needed by the search form (collection
    selection boxes, ranking methods, output formats) from the current
    collection `cc`, the searched collections `colls` and the language
    `ln`, decides whether the collection boxes and the title are shown,
    and passes these together with all the search arguments received to
    websearch_templates.tmpl_search_box(), whose result is returned.
    """

    # load the right message language
    _ = gettext_set_language(ln)

    # some computations
    cc_intl = get_coll_i18nname(cc, ln, False)
    cc_colID = get_colID(cc)

    if cfg_nicely_ordered_collection_list:
        colls_nicely_ordered = get_nicely_ordered_collection_list(ln=ln)
    else:
        colls_nicely_ordered = get_alphabetically_ordered_collection_list(ln=ln)

    # options for the collection select boxes, without unnamed collections
    colls_nice = []
    for (cx, cx_printable) in colls_nicely_ordered:
        if not cx.startswith("Unnamed collection"):
            colls_nice.append({ 'value' : cx,
                                'text' : cx_printable
                              })

    coll_selects = []
    if colls and colls[0] != CFG_SITE_NAME:
        # some collections are defined, so print these first, and only then print 'add another collection' heading:
        for c in colls:
            if c:
                temp = []
                temp.append({ 'value' : CFG_SITE_NAME,
                              'text' : '*** %s ***' % _("any public collection")
                            })
                # this field is used to remove the current collection from the ones to be searched.
                temp.append({ 'value' : '',
                              'text' : '*** %s ***' % _("remove this collection")
                            })
                for val in colls_nice:
                    # print collection:
                    # Test the option being added. This used to test `cx`,
                    # the variable left over from the loop above, so the
                    # outcome depended only on the last collection of
                    # colls_nicely_ordered.
                    if not val['value'].startswith("Unnamed collection"):
                        temp.append({ 'value' : val['value'],
                                      'text' : val['text'],
                                      # leading blanks/dashes of the option value are ignored when comparing
                                      'selected' : (c == re.sub(r"^[\s\-]*", "", val['value']))
                                    })
                coll_selects.append(temp)
        coll_selects.append([{ 'value' : '',
                               'text' : '*** %s ***' % _("add another collection")
                             }] + colls_nice)
    else: # we searched in CFG_SITE_NAME, so print 'any public collection' heading
        coll_selects.append([{ 'value' : CFG_SITE_NAME,
                               'text' : '*** %s ***' % _("any public collection")
                             }] + colls_nice)

    ## ranking methods
    ranks = [{
               'value' : '',
               'text' : "- %s %s -" % (_("OR").lower (), _("rank by")),
             }]
    for (code, name) in get_bibrank_methods(cc_colID, ln):
        # propose found rank methods:
        ranks.append({
                       'value' : code,
                       'text' : name,
                     })

    formats = []
    query = """SELECT code,name FROM format WHERE visibility='1' ORDER BY name ASC"""
    res = run_sql(query)
    if res:
        # propose found formats:
        for code, name in res:
            formats.append({ 'value' : code,
                             'text' : name
                           })
    else:
        # no visible format found: fall back to HTML brief
        formats.append({'value' : 'hb',
                        'text' : _("HTML brief")
                       })

    # show collections in the search box? (not if there is only one
    # collection defined, and not if we are in light search)
    show_colls = True
    show_title = True
    if len(collection_reclist_cache.cache.keys()) == 1 or \
           aas == -1:
        show_colls = False
        show_title = False

    if cc == CFG_SITE_NAME:
        show_title = False

    return websearch_templates.tmpl_search_box(
             ln = ln,
             aas = aas,
             cc_intl = cc_intl,
             cc = cc,
             ot = ot,
             sp = sp,
             action = action,
             fieldslist = get_searchwithin_fields(ln=ln, colID=cc_colID),
             f1 = f1,
             f2 = f2,
             f3 = f3,
             m1 = m1,
             m2 = m2,
             m3 = m3,
             p1 = p1,
             p2 = p2,
             p3 = p3,
             op1 = op1,
             op2 = op2,
             rm = rm,
             p = p,
             f = f,
             coll_selects = coll_selects,
             d1y = d1y, d2y = d2y, d1m = d1m, d2m = d2m, d1d = d1d, d2d = d2d,
             dt = dt,
             sort_fields = get_sortby_fields(ln=ln, colID=cc_colID),
             sf = sf,
             so = so,
             ranks = ranks,
             sc = sc,
             rg = rg,
             formats = formats,
             of = of,
             pl = pl,
             jrec = jrec,
             ec = ec,
             show_colls = show_colls,
             show_title = show_title,
           )
def create_navtrail_links(cc=CFG_SITE_NAME, aas=0, ln=CFG_SITE_LANG, self_p=1, tab=''):
    """Creates navigation trail links, i.e. links to collection
    ancestors (except Home collection).  If aas==1, then links to
    Advanced Search interfaces; otherwise Simple Search.

    When self_p is true and cc is not the Home collection, cc itself is
    added as the last element of the trail.  The tab argument is accepted
    but not used here.
    """
    # (name, translated name) for every ancestor except the Home collection
    trail = [(ancestor, get_coll_i18nname(ancestor, ln, False))
             for ancestor in get_coll_ancestors(cc)
             if ancestor != CFG_SITE_NAME]
    if self_p and cc != CFG_SITE_NAME:
        trail.append((cc, get_coll_i18nname(cc, ln, False)))
    return websearch_templates.tmpl_navtrail_links(aas=aas, ln=ln, dads=trail)
def get_searchwithin_fields(ln='en', colID=None):
    """Retrieves the fields name used in the 'search within' selection box for the collection ID colID.

    Returns a list of {'value': ..., 'text': ...} dictionaries that starts
    with the "any field" entry (empty value).  Rows with an empty code or
    with the code "anyfield" are left out.
    """
    rows = None
    if colID:
        # fields configured as 'search within' (type 'sew') for this collection
        rows = run_sql_cached(
            "SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff\n"
            "WHERE cff.type='sew' AND cff.id_collection=%s AND cff.id_field=f.id\n"
            "ORDER BY cff.score DESC, f.name ASC",
            (colID,),
            affected_tables=['field', 'collection_field_fieldvalue'])
    if not rows:
        # nothing configured (or no colID given): offer every known field
        rows = run_sql_cached("SELECT code,name FROM field ORDER BY name ASC",
                              affected_tables=['field',])

    options = [{
                 'value' : '',
                 'text' : get_field_i18nname("any field", ln, False)
               }]
    options.extend([{ 'value' : code,
                      'text' : get_field_i18nname(name, ln, False)
                    }
                    for code, name in rows
                    if code and code != "anyfield"])
    return options
def get_sortby_fields(ln='en', colID=None):
    """Retrieves the fields name used in the 'sort by' selection box for the collection ID colID.

    Returns a list of {'value': ..., 'text': ...} dictionaries that starts
    with the "latest first" entry (empty value).  Rows with an empty code
    or with the code "anyfield" are left out.
    """
    _ = gettext_set_language(ln)

    # sort fields (type 'soo') attached to one given collection
    per_collection_query = (
        "SELECT DISTINCT(f.code),f.name FROM field AS f, collection_field_fieldvalue AS cff\n"
        "WHERE cff.type='soo' AND cff.id_collection=%s AND cff.id_field=f.id\n"
        "ORDER BY cff.score DESC, f.name ASC")
    # sort fields attached to any collection
    any_collection_query = (
        "SELECT DISTINCT(f.code),f.name FROM field AS f, collection_field_fieldvalue AS cff\n"
        "WHERE cff.type='soo' AND cff.id_field=f.id\n"
        "ORDER BY cff.score DESC, f.name ASC")

    rows = None
    if colID:
        rows = run_sql_cached(per_collection_query, (colID,),
                              affected_tables=['field', 'collection_field_fieldvalue'])
    if not rows:
        # no sort fields defined for this colID, try to take Home collection:
        rows = run_sql_cached(per_collection_query, (1,),
                              affected_tables=['field', 'collection_field_fieldvalue'])
    if not rows:
        # no sort fields defined for the Home collection, take all sort fields defined wherever they are:
        rows = run_sql_cached(any_collection_query,
                              affected_tables=['field', 'collection_field_fieldvalue'])

    options = [{
                 'value' : '',
                 'text' : _("latest first")
               }]
    options.extend([{ 'value' : code,
                      'text' : get_field_i18nname(name, ln, False)
                    }
                    for code, name in rows
                    if code and code != "anyfield"])
    return options
def create_andornot_box(name='op', value='', ln='en'):
    """Returns HTML code for the AND/OR/NOT selection box.

    @param name: name attribute of the generated select element
    @param value: currently chosen operator ('a', 'o' or 'n'); the matching
        option is marked as selected
    @param ln: language of the option labels
    @return: the select element as a string
    """
    _ = gettext_set_language(ln)

    # The template needs one placeholder for the select name plus a
    # (selected marker, label) pair per operator; with no placeholders the
    # % operator raises "not all arguments converted".
    # NOTE(review): markup rebuilt from the argument list - compare with the
    # upstream template before release.
    out = """
    <select name="%s">
    <option value="a"%s>%s</option>
    <option value="o"%s>%s</option>
    <option value="n"%s>%s</option>
    </select>
    """ % (name,
           is_selected('a', value), _("AND"),
           is_selected('o', value), _("OR"),
           is_selected('n', value), _("AND NOT"))

    return out
def create_matchtype_box(name='m', value='', ln='en'):
    """Returns HTML code for the 'match type' selection box.

    @param name: name attribute of the generated select element
    @param value: currently chosen match type ('a', 'o', 'e', 'p' or 'r');
        the matching option is marked as selected
    @param ln: language of the option labels
    @return: the select element as a string
    """
    _ = gettext_set_language(ln)

    # The template needs one placeholder for the select name plus a
    # (selected marker, label) pair per match type; with no placeholders the
    # % operator raises "not all arguments converted".
    # NOTE(review): markup rebuilt from the argument list - compare with the
    # upstream template before release.
    out = """
    <select name="%s">
    <option value="a"%s>%s</option>
    <option value="o"%s>%s</option>
    <option value="e"%s>%s</option>
    <option value="p"%s>%s</option>
    <option value="r"%s>%s</option>
    </select>
    """ % (name,
           is_selected('a', value), _("All of the words:"),
           is_selected('o', value), _("Any of the words:"),
           is_selected('e', value), _("Exact phrase:"),
           is_selected('p', value), _("Partial phrase:"),
           is_selected('r', value), _("Regular expression:"))

    return out
def is_selected(var, fld):
    """Checks if the two are equal, and if yes, returns ' selected'.  Useful for select boxes.

    Two ints are compared as ints; anything else is compared through its
    str() form.  In addition, a three-character fld of the form 'w' + var
    (e.g. fld 'wrn' for var 'rn') also counts as a match.

    @param var: value of the option being rendered
    @param fld: currently chosen value
    @return: ' selected' on a match, '' otherwise
    """
    if type(var) is int and type(fld) is int:
        if var == fld:
            return " selected"
        return ""
    if str(var) == str(fld):
        return " selected"
    # Work on the text form of fld: calling len() on fld directly raised
    # TypeError whenever fld was a truthy non-string (e.g. a non-zero int)
    # and var was a string.
    fld_text = str(fld)
    if len(fld_text) == 3 and fld_text[0] == "w" and var == fld_text[1:]:
        return " selected"
    return ""
def wash_colls(cc, c, split_colls=0, verbose=0):
"""Wash collection list by checking whether user has deselected
anything under 'Narrow search'. Checks also if cc is a list or not.
Return list of cc, colls_to_display, colls_to_search since the list
of collections to display is different from that to search in.
This is because users might have chosen 'split by collection'
functionality.
The behaviour of "collections to display" depends solely whether
user has deselected a particular collection: e.g. if it started
from 'Articles and Preprints' page, and deselected 'Preprints',
then collection to display is 'Articles'. If he did not deselect
anything, then collection to display is 'Articles & Preprints'.
The behaviour of "collections to search in" depends on the
'split_colls' parameter:
* if is equal to 1, then we can wash the colls list down
and search solely in the collection the user started from;
* if is equal to 0, then we are splitting to the first level
of collections, i.e. collections as they appear on the page
we started to search from;
The function raises exception
InvenioWebSearchUnknownCollectionError
if cc or one of c collections is not known.
"""
colls_out = []
colls_out_for_display = []
# list to hold the hosted collections to be searched and displayed
hosted_colls_out = []
debug = ""
if verbose:
debug += " "
debug += " 1) --- initial parameters ---"
debug += " cc : %s" % cc
debug += " c : %s" % c
debug += " "
# check what type is 'cc':
if type(cc) is list:
for ci in cc:
if collection_reclist_cache.cache.has_key(ci):
# yes this collection is real, so use it:
cc = ci
break
else:
# check once if cc is real:
if not collection_reclist_cache.cache.has_key(cc):
if cc:
raise InvenioWebSearchUnknownCollectionError(cc)
else:
cc = CFG_SITE_NAME # cc is not set, so replace it with Home collection
# check type of 'c' argument:
if type(c) is list:
colls = c
else:
colls = [c]
if verbose:
debug += " 2) --- after check for the integrity of cc and the being or not c a list ---"
debug += " cc : %s" % cc
debug += " c : %s" % c
debug += " "
# remove all 'unreal' collections:
colls_real = []
for coll in colls:
if collection_reclist_cache.cache.has_key(coll):
colls_real.append(coll)
else:
if coll:
raise InvenioWebSearchUnknownCollectionError(coll)
colls = colls_real
if verbose:
debug += " 3) --- keeping only the real colls of c ---"
debug += " colls : %s" % colls
debug += " "
# check if some real collections remain:
if len(colls)==0:
colls = [cc]
if verbose:
debug += " 4) --- in case no colls were left we use cc directly ---"
debug += " colls : %s" % colls
debug += " "
# then let us check the list of non-restricted "real" sons of 'cc' and compare it to 'coll':
res = run_sql("""SELECT c.name FROM collection AS c,
collection_collection AS cc,
collection AS ccc
WHERE c.id=cc.id_son AND cc.id_dad=ccc.id
AND ccc.name=%s AND cc.type='r'""", (cc,))
# list that holds all the non restricted sons of cc that are also not hosted collections
l_cc_nonrestricted_sons_and_nonhosted_colls = []
res_hosted = run_sql("""SELECT c.name FROM collection AS c,
collection_collection AS cc,
collection AS ccc
WHERE c.id=cc.id_son AND cc.id_dad=ccc.id
AND ccc.name=%s AND cc.type='r'
AND (c.dbquery NOT LIKE 'hostedcollection:%%' OR c.dbquery IS NULL)""", (cc,))
for row_hosted in res_hosted:
l_cc_nonrestricted_sons_and_nonhosted_colls.append(row_hosted[0])
l_cc_nonrestricted_sons_and_nonhosted_colls.sort()
l_cc_nonrestricted_sons = []
l_c = colls
for row in res:
if not collection_restricted_p(row[0]):
l_cc_nonrestricted_sons.append(row[0])
l_c.sort()
l_cc_nonrestricted_sons.sort()
if l_cc_nonrestricted_sons == l_c:
colls_out_for_display = [cc] # yep, washing permitted, it is sufficient to display 'cc'
# the following elif is a hack that preserves the above funcionality when we start searching from
# the frontpage with some hosted collections deselected (either by default or manually)
elif set(l_cc_nonrestricted_sons_and_nonhosted_colls).issubset(set(l_c)):
colls_out_for_display = colls
split_colls = 0
else:
colls_out_for_display = colls # nope, we need to display all 'colls' successively
# remove duplicates:
#colls_out_for_display_nondups=filter(lambda x, colls_out_for_display=colls_out_for_display: colls_out_for_display[x-1] not in colls_out_for_display[x:], range(1, len(colls_out_for_display)+1))
#colls_out_for_display = map(lambda x, colls_out_for_display=colls_out_for_display:colls_out_for_display[x-1], colls_out_for_display_nondups)
colls_out_for_display = list(set(colls_out_for_display))
if verbose:
debug += " 5) --- decide whether colls_out_for_diplay should be colls or is it sufficient for it to be cc; remove duplicates ---"
debug += " colls_out_for_display : %s" % colls_out_for_display
debug += " "
# the following piece of code takes care of removing collections whose ancestors are going to be searched anyway
# list to hold the collections to be removed
colls_to_be_removed = []
# first calculate the collections that can safely be removed
for coll in colls_out_for_display:
for ancestor in get_coll_ancestors(coll):
#if ancestor in colls_out_for_display: colls_to_be_removed.append(coll)
if ancestor in colls_out_for_display and not is_hosted_collection(coll): colls_to_be_removed.append(coll)
# secondly remove the collections
for coll in colls_to_be_removed:
colls_out_for_display.remove(coll)
if verbose:
debug += " 6) --- remove collections that have ancestors about to be search, unless they are hosted ---"
debug += " colls_out_for_display : %s" % colls_out_for_display
debug += " "
# calculate the hosted collections to be searched.
if colls_out_for_display == [cc]:
if is_hosted_collection(cc):
hosted_colls_out.append(cc)
else:
for coll in get_coll_sons(cc):
if is_hosted_collection(coll):
hosted_colls_out.append(coll)
else:
for coll in colls_out_for_display:
if is_hosted_collection(coll):
hosted_colls_out.append(coll)
if verbose:
debug += " 7) --- calculate the hosted_colls_out ---"
debug += " hosted_colls_out : %s" % hosted_colls_out
debug += " "
# second, let us decide on collection splitting:
if split_colls == 0:
# type A - no sons are wanted
colls_out = colls_out_for_display
else:
# type B - sons (first-level descendants) are wanted
for coll in colls_out_for_display:
coll_sons = get_coll_sons(coll)
if coll_sons == []:
colls_out.append(coll)
else:
for coll_son in coll_sons:
if not is_hosted_collection(coll_son):
colls_out.append(coll_son)
#else:
# colls_out = colls_out + coll_sons
# remove duplicates:
#colls_out_nondups=filter(lambda x, colls_out=colls_out: colls_out[x-1] not in colls_out[x:], range(1, len(colls_out)+1))
#colls_out = map(lambda x, colls_out=colls_out:colls_out[x-1], colls_out_nondups)
colls_out = list(set(colls_out))
if verbose:
debug += " 8) --- calculate the colls_out; remove duplicates ---"
debug += " colls_out : %s" % colls_out
debug += " "
# remove the hosted collections from the collections to be searched
if hosted_colls_out:
for coll in hosted_colls_out:
try:
colls_out.remove(coll)
except ValueError:
# in case coll was not found in colls_out
pass
if verbose:
debug += " 9) --- remove the hosted_colls from the colls_out ---"
debug += " colls_out : %s" % colls_out
return (cc, colls_out_for_display, colls_out, hosted_colls_out, debug)
def strip_accents(x):
    """Strip accents in the input phrase X (assumed in UTF-8) by replacing
    accented characters with their unaccented cousins (e.g. é by e).

    The re_latex_* patterns are applied first, directly on the input
    string.  The result is then decoded from UTF-8, the re_unicode_*
    patterns are applied on the Unicode string, and the outcome is
    encoded back into UTF-8.

    Return such a stripped X.  If X cannot be decoded as UTF-8, return
    it with only the re_latex_* replacements applied.
    """
    # (pattern, replacement) pairs, applied strictly in this order:
    latex_replacements = (
        (re_latex_lowercase_a, "a"),
        (re_latex_lowercase_ae, "ae"),
        (re_latex_lowercase_e, "e"),
        (re_latex_lowercase_i, "i"),
        (re_latex_lowercase_o, "o"),
        (re_latex_lowercase_u, "u"),
        # this used to substitute "x", which corrupted every accented 'y':
        (re_latex_lowercase_y, "y"),
        (re_latex_lowercase_c, "c"),
        (re_latex_lowercase_n, "n"),
        (re_latex_uppercase_a, "A"),
        (re_latex_uppercase_ae, "AE"),
        (re_latex_uppercase_e, "E"),
        (re_latex_uppercase_i, "I"),
        (re_latex_uppercase_o, "O"),
        (re_latex_uppercase_u, "U"),
        (re_latex_uppercase_y, "Y"),
        (re_latex_uppercase_c, "C"),
        (re_latex_uppercase_n, "N"),
    )
    for pattern, replacement in latex_replacements:
        x = pattern.sub(replacement, x)
    # convert input into Unicode string:
    try:
        y = unicode(x, "utf-8")
    except (UnicodeError, TypeError):
        # the input wasn't valid UTF-8 (UnicodeError) or wasn't a byte
        # string at all (TypeError); hand back what we have so far
        return x
    unicode_replacements = (
        # asciify lowercase characters:
        (re_unicode_lowercase_a, "a"),
        (re_unicode_lowercase_ae, "ae"),
        (re_unicode_lowercase_e, "e"),
        (re_unicode_lowercase_i, "i"),
        (re_unicode_lowercase_o, "o"),
        (re_unicode_lowercase_u, "u"),
        (re_unicode_lowercase_y, "y"),
        (re_unicode_lowercase_c, "c"),
        (re_unicode_lowercase_n, "n"),
        # asciify uppercase characters:
        (re_unicode_uppercase_a, "A"),
        (re_unicode_uppercase_ae, "AE"),
        (re_unicode_uppercase_e, "E"),
        (re_unicode_uppercase_i, "I"),
        (re_unicode_uppercase_o, "O"),
        (re_unicode_uppercase_u, "U"),
        (re_unicode_uppercase_y, "Y"),
        (re_unicode_uppercase_c, "C"),
        (re_unicode_uppercase_n, "N"),
    )
    for pattern, replacement in unicode_replacements:
        y = pattern.sub(replacement, y)
    # return UTF-8 representation of the Unicode string:
    return y.encode("utf-8")
def wash_index_term(term, max_char_length=50, lower_term=True):
    """
    Return the washed form of the index term TERM, as described by the
    original author: suitable for storing into the idxWORD* tables.

    TERM is a UTF-8 encoded byte string.  It is decoded into a Unicode
    string, lowercased when LOWER_TERM is true, cut down to at most
    MAX_CHAR_LENGTH characters and encoded back into UTF-8.

    Truncation is done on the Unicode string rather than on the UTF-8
    bytes so that a multi-byte character can never be cut in the
    middle.  (MAX_CHAR_LENGTH characters may therefore take up to
    4*MAX_CHAR_LENGTH bytes once encoded.)

    Raises UnicodeDecodeError if TERM is not valid UTF-8.
    """
    decoded_term = unicode(term, 'utf-8')
    if lower_term:
        decoded_term = decoded_term.lower()
    # slicing is a no-op for terms that already fit, so no explicit
    # length test is needed before truncating:
    return decoded_term[:max_char_length].encode('utf-8')
def lower_index_term(term):
    """
    Return the lowercased form of the UTF-8 encoded index term TERM.

    The term is decoded into a Unicode string before lowering and
    encoded back into UTF-8 afterwards; the original author notes that
    lower() applied directly to the byte string is not UTF-8 safe.
    Meant to be called by both the search engine and the indexer when
    appropriate (e.g. before stemming).

    Raises UnicodeDecodeError when TERM is not valid UTF-8, so the
    client code may want to catch it.
    """
    decoded_term = unicode(term, 'utf-8')
    lowered_term = decoded_term.lower()
    return lowered_term.encode('utf-8')
def wash_output_format(format):
    """Wash output format FORMAT.

    A format whose first three characters are all digits is taken as a
    request to print MARC tags; unless it is exactly six characters
    long it is replaced by the HTML brief default 'hb'.  This prevents
    input like 'of=9' (for this task, 'of=tm' is preferred).  Any
    other format is returned unchanged.
    """
    starts_with_digits = str(format[0:3]).isdigit()
    if starts_with_digits and len(format) != 6:
        # asked to print MARC tags, but not enough digits,
        # so let's switch back to HTML brief default
        return 'hb'
    return format
def wash_pattern(p):
    """Wash pattern passed by URL.

    Wildcards at the beginning of words are dropped, as are wildcards
    appended to extremely short words (1-3 letters, according to the
    original description).  Text enclosed in single quotes, double
    quotes or slashes has its spaces protected while the short-word
    check runs.  Special terms matched by re_pattern_today are replaced
    by today's date (YYYY-MM-DD, local time), and surrounding
    whitespace is stripped.

    TODO: instead of this approximative treatment, it will be much
    better to introduce a temporal limit, e.g. to kill a query if it
    does not finish in 10 seconds.
    """
    def _protect_spaces(delimiter):
        # build a substitution callback that re-wraps the matched group
        # in DELIMITER with its spaces turned into __SPACE__
        def _callback(match):
            return delimiter + string.replace(match.group(1), ' ', '__SPACE__') + delimiter
        return _callback

    # strip accents:
    # p = strip_accents(p) # FIXME: when available, strip accents all the time
    # the wildcard-sanity regexps below expect leading/trailing whitespace:
    p = " " + p + " "
    # get rid of wildcards at the beginning of words:
    p = re_pattern_wildcards_at_beginning.sub("\\1", p)
    # replace spaces within quotes by __SPACE__ temporarily:
    quoted_patterns = ((re_pattern_single_quotes, "'"),
                       (re_pattern_double_quotes, "\""),
                       (re_pattern_regexp_quotes, "/"))
    for quoted_regexp, delimiter in quoted_patterns:
        p = quoted_regexp.sub(_protect_spaces(delimiter), p)
    # get rid of wildcards attached to extremely short words:
    p = re_pattern_short_words.sub("\\1", p)
    # replace back __SPACE__ by spaces:
    p = re_pattern_space.sub(" ", p)
    # replace special terms:
    today = time.strftime("%Y-%m-%d", time.localtime())
    p = re_pattern_today.sub(today, p)
    # remove unnecessary whitespace:
    return string.strip(p)
def wash_field(f):
    """Wash field passed by URL.

    Surrounding whitespace is stripped.  If the lowercased field name
    is a key of CFG_WEBSEARCH_FIELDS_CONVERT (old-style CDS
    Invenio/ALEPH 'f' argument, e.g. 'wau' and 'au' are replaced by
    'author'), the mapped field name is returned instead.
    """
    # get rid of unnecessary whitespace:
    f = string.strip(f)
    # the conversion table is looked up with the lowercased name both
    # for the membership test and for the fetch; fetching with the
    # original spelling raised KeyError for input such as 'AU'
    lowered_field = string.lower(f)
    if lowered_field in CFG_WEBSEARCH_FIELDS_CONVERT:
        f = CFG_WEBSEARCH_FIELDS_CONVERT[lowered_field]
    return f
def wash_dates(d1="", d1y=0, d1m=0, d1d=0, d2="", d2y=0, d2m=0, d2d=0):
    """
    Take user-submitted date arguments D1 (full datetime string) or
    the (D1Y, D1M, D1D) year, month, day values, and likewise D2 or
    (D2Y, D2M, D2D), and return a (datetext1, datetext2) tuple of
    datetime strings in the YYYY-MM-DD HH:MM:SS format suitable for
    time restricted searching.

    When both D1 and (D1Y, D1M, D1D) are present, D1 takes precedence
    and is returned as is.  Ditto for D2*.

    When the year/month/day values are used, the time part is always
    00:00:00 and missing components are completed: the starting date
    falls back to 0000, 01, 01 and the ending date to 9999, 12, 31.

    When no argument is set at all, ("", "") is returned.
    """
    def _compose_datetext(year, month, day, default_year, default_month, default_day):
        # glue the date together, substituting the given default for
        # every component that is missing
        if year:
            text = "%04d" % year
        else:
            text = default_year
        if month:
            text += "-%02d" % month
        else:
            text += default_month
        if day:
            text += "-%02d" % day
        else:
            text += default_day
        return text + " 00:00:00"

    # sanity checking: nothing selected, so return empty values
    if (d1, d1y, d1m, d1d, d2, d2y, d2m, d2d) == ("", 0, 0, 0, "", 0, 0, 0):
        return ("", "")
    # wash first (starting) date; full datetime string takes precedence:
    if d1:
        datetext1 = d1
    else:
        datetext1 = _compose_datetext(d1y, d1m, d1d, "0000", "-01", "-01")
    # wash second (ending) date; full datetime string takes precedence.
    # NOTE: day 31 is used whatever the month is; for our querying it
    # is not needed to compute the real last day, 31 will always do.
    if d2:
        datetext2 = d2
    else:
        datetext2 = _compose_datetext(d2y, d2m, d2d, "9999", "-12", "-31")
    # okay, return constructed YYYY-MM-DD HH:MM:SS datetexts:
    return (datetext1, datetext2)
def is_hosted_collection(coll):
    """Check if the given collection is a hosted one; i.e. its dbquery
    starts with 'hostedcollection:'.

    Returns True if it is.  Returns False if it is not, if no
    collection named COLL was found, or if its dbquery is not a string
    (e.g. NULL).  Errors raised by run_sql() itself are not caught
    here and propagate to the caller.
    """
    res = run_sql("SELECT dbquery FROM collection WHERE name=%s", (coll, ))
    try:
        return res[0][0].startswith("hostedcollection:")
    except (IndexError, TypeError, AttributeError):
        # IndexError: empty result; TypeError/AttributeError: the result
        # or the dbquery value is None or otherwise not a string
        return False
def get_colID(c):
    "Return collection ID for collection name C. Return None if no match found."
    rows = run_sql("SELECT id FROM collection WHERE name=%s", (c,), 1)
    if not rows:
        return None
    # the ID sits in the first column of the first row
    return rows[0][0]
def get_coll_ancestors(coll):
    """Return the list of ancestors of collection 'coll'.

    The collection tree is climbed one level at a time: at each step
    the collection with the lowest id_dad having the current one as a
    son is taken.  The climb stops when no such collection is found.
    The list is returned with the most distant ancestor first and the
    direct parent last.
    """
    parent_query = ("SELECT c.name FROM collection AS c\n"
                    "LEFT JOIN collection_collection AS cc ON c.id=cc.id_dad\n"
                    "LEFT JOIN collection AS ccc ON ccc.id=cc.id_son\n"
                    "WHERE ccc.name=%s ORDER BY cc.id_dad ASC LIMIT 1")
    lineage = []
    current_name = coll
    parent_rows = run_sql(parent_query, (current_name,))
    while parent_rows:
        current_name = parent_rows[0][0]
        lineage.append(current_name)
        parent_rows = run_sql(parent_query, (current_name,))
    # ancestors were collected bottom-up, return them top-down:
    lineage.reverse()
    return lineage
def get_coll_sons(coll, type='r', public_only=1):
    """Return a list of sons (first-level descendants) of type 'type' for collection 'coll'.
    The sons are listed by decreasing score.
    If public_only, then return only non-restricted son collections.
    """
    sons_query = ("SELECT c.name FROM collection AS c "
                  "LEFT JOIN collection_collection AS cc ON c.id=cc.id_son "
                  "LEFT JOIN collection AS ccc ON ccc.id=cc.id_dad "
                  "WHERE cc.type=%s AND ccc.name=%s"
                  " ORDER BY cc.score DESC")
    son_rows = run_sql(sons_query, (type, coll))
    # the restriction check is only performed when public_only is set:
    return [son_row[0] for son_row in son_rows
            if not public_only or not collection_restricted_p(son_row[0])]
def get_coll_real_descendants(coll, type='_', get_hosted_colls=True):
    """Return a list of all descendants of collection 'coll' that are defined by a 'dbquery'.
    IOW, we need to decompose compound collections like "A & B" into "A" and "B" provided
    that "A & B" has no associated database query defined.

    'type' is matched against the relation type with SQL LIKE, so the
    default '_' accepts any single-character type.

    If 'get_hosted_colls' is false, collections whose dbquery starts
    with 'hostedcollection:' are left out, at every level of the
    recursion.
    """
    coll_sons = []
    res = run_sql("SELECT c.name,c.dbquery FROM collection AS c\n"
                  "LEFT JOIN collection_collection AS cc ON c.id=cc.id_son\n"
                  "LEFT JOIN collection AS ccc ON ccc.id=cc.id_dad\n"
                  "WHERE ccc.name=%s AND cc.type LIKE %s ORDER BY cc.score DESC",
                  (coll, type,))
    for name, dbquery in res:
        if dbquery: # this is 'real' collection, so return it:
            if get_hosted_colls or not dbquery.startswith("hostedcollection:"):
                coll_sons.append(name)
        else: # this is 'composed' collection, so recurse:
            # the hosted-collection filter has to be handed down,
            # otherwise hosted collections sitting below a composed
            # collection were returned even with get_hosted_colls=False.
            # NOTE(review): 'type' is deliberately left at its default
            # for the recursion, as before -- confirm that is intended.
            coll_sons.extend(get_coll_real_descendants(name, get_hosted_colls=get_hosted_colls))
    return coll_sons
def browse_pattern(req, colls, p, f, rg, ln=CFG_SITE_LANG):
    """Browse either bibliographic phrases or words indexes, and display it.

    'p' is the pattern to browse around, 'f' the field; when 'f' is
    empty and 'p' looks like 'field:pattern', the two are split.  With
    no field at all, the browsing is delegated to browse_in_bibwords().
    'rg' controls how many neighbouring terms are fetched and 'colls'
    is the list of collection names the hits are counted in when the
    field has an index.

    The rendered output is written to 'req'; nothing is returned,
    except whatever browse_in_bibwords() returns when delegating.
    """
    # load the right message language
    _ = gettext_set_language(ln)
    ## is p enclosed in quotes? (coming from exact search)
    if p.startswith('"') and p.endswith('"'):
        p = p[1:-1]
    ## okay, "real browse" follows:
    ## FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test
    if not f and string.find(p, ":") > 0: # does 'p' contain ':'?
        f, p = string.split(p, ":", 1)
    ## do we search in words indexes?
    if not f:
        # hand the language down, so that the words browser does not
        # fall back to the site default language
        return browse_in_bibwords(req, p, f, ln=ln)
    index_id = get_index_id_from_field(f)
    if index_id != 0:
        # the field is indexed: browse the phrase index, restricted to
        # the union of the records of the selected collections
        coll = HitSet()
        for coll_name in colls:
            coll |= get_collection_reclist(coll_name)
        browsed_phrases_in_colls = get_nearest_terms_in_idxphrase_with_collection(p, index_id, rg/2, rg/2, coll)
    else:
        browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg+1)/2+1, (rg-1)/2+1)
        while not browsed_phrases:
            if not p:
                # nothing left to shorten; retrying with the same empty
                # pattern would loop forever
                req.write(_("No values found."))
                return
            # try again and again with shorter and shorter pattern:
            try:
                p = p[:-1]
                browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg+1)/2+1, (rg-1)/2+1)
            except:
                # probably there are no hits at all:
                req.write(_("No values found."))
                return
        ## Checking the hits in the particular collection selection is
        ## disabled here (and so is the warning "No match close to %s
        ## found in given collections. Please try different term.
        ## Displaying matches in any collection..."), so get the
        ## nbhits for these phrases in any collection:
        browsed_phrases_in_colls = []
        for phrase in browsed_phrases:
            browsed_phrases_in_colls.append([phrase, get_nbhits_in_bibxxx(phrase, f)])
    ## display results now:
    out = websearch_templates.tmpl_browse_pattern(
        f=f,
        fn=get_field_i18nname(get_field_name(f) or f, ln, False),
        ln=ln,
        browsed_phrases_in_colls=browsed_phrases_in_colls,
        colls=colls,
        rg=rg,
        )
    req.write(out)
    return
def browse_in_bibwords(req, p, f, ln=CFG_SITE_LANG):
    """Browse inside words indexes.

    Does nothing when 'p' is empty.  Otherwise builds a nearest terms
    box for word 'p' in field 'f', whose links reuse the arguments of
    the current request with 'action' forced to 'search', and writes
    the rendered template to 'req'.
    """
    if not p:
        return
    _ = gettext_set_language(ln)
    # work on a copy, so that the request arguments are left untouched:
    search_argd = dict(req.argd)
    search_argd['action'] = 'search'
    rendered = websearch_templates.tmpl_search_in_bibwords(
        p = p,
        f = f,
        ln = ln,
        nearest_box = create_nearest_terms_box(search_argd, p, f, 'w', ln=ln, intro_text_p=0)
        )
    req.write(rendered)
    return
def search_pattern(req=None, p=None, f=None, m=None, ap=0, of="id", verbose=0, ln=CFG_SITE_LANG, display_nearest_terms_box=True):
"""Search for complex pattern 'p' within field 'f' according to
matching type 'm'. Return hitset of recIDs.
The function uses multi-stage searching algorithm in case of no
exact match found. See the Search Internals document for
detailed description.
The 'ap' argument governs whether an alternative patterns are to
be used in case there is no direct hit for (p,f,m). For
example, whether to replace non-alphanumeric characters by
spaces if it would give some hits. See the Search Internals
document for detailed description. (ap=0 forbids the
alternative pattern usage, ap=1 permits it.)
The 'of' argument governs whether to print or not some
information to the user in case of no match found. (Usually it
prints the information in case of HTML formats, otherwise it's
silent).
The 'verbose' argument controls the level of debugging information
to be printed (0=least, 9=most).
All the parameters are assumed to have been previously washed.
This function is suitable as a mid-level API.
"""
_ = gettext_set_language(ln)
hitset_empty = HitSet()
# sanity check:
if not p:
hitset_full = HitSet(trailing_bits=1)
hitset_full.discard(0)
# no pattern, so return all universe
return hitset_full
# search stage 1: break up arguments into basic search units:
if verbose and of.startswith("h"):
t1 = os.times()[4]
basic_search_units = create_basic_search_units(req, p, f, m, of)
if verbose and of.startswith("h"):
t2 = os.times()[4]
print_warning(req, "Search stage 1: basic search units are: %s" % cgi.escape(repr(basic_search_units)))
print_warning(req, "Search stage 1: execution took %.2f seconds." % (t2 - t1))
# search stage 2: do search for each search unit and verify hit presence:
if verbose and of.startswith("h"):
t1 = os.times()[4]
basic_search_units_hitsets = []
# NOTE(review): the lines starting with '+' below look like unified-diff
# additions pasted in verbatim; the '+' markers are not valid Python and
# have to be stripped before this can run.
# Intent of the added block: queries on the tags listed in
# CFG_BIBFORMAT_HIDDEN_TAGS yield no hits, unless the user is authorized
# for the 'runbibedit' action, in which case no tag is treated as hidden.
+ #prepare hiddenfield-related..
+ myhiddens = CFG_BIBFORMAT_HIDDEN_TAGS
+ can_see_hidden = False
+ if req:
+ user_info = collect_user_info(req)
+ can_see_hidden = (acc_authorize_action(user_info, 'runbibedit')[0] == 0)
+ if can_see_hidden:
+ myhiddens = []
+
for idx_unit in xrange(len(basic_search_units)):
bsu_o, bsu_p, bsu_f, bsu_m = basic_search_units[idx_unit]
basic_search_unit_hitset = search_unit(bsu_p, bsu_f, bsu_m)
# the unit is treated as hidden when its field starts with a hidden tag
# NOTE(review): bsu_f is sliced below, so it is assumed to be a string
# here -- confirm it can never be None.
+ #check that the user is allowed to search with this tag..
+ for htag in myhiddens:
+ ltag = len(htag)
+ samelenfield = bsu_f[0:ltag]
+ if samelenfield == htag:
+ #we won't show you anything, user
+ basic_search_unit_hitset = HitSet()
# NOTE(review): in the call below '%' binds to basic_search_unit_hitset
# only, so the two-placeholder format string receives a single value
# (TypeError when reached) and str(myhiddens) becomes a separate
# print_warning() argument.  The value formatted is also the hitset that
# was just emptied rather than the pattern bsu_p; presumably
# (bsu_p, str(myhiddens)) was meant.
+ if verbose >= 9 and of.startswith("h"):
+ print_warning(req, "Pattern %s hitlist omitted since it queries a hidden tag in %s" %
+ basic_search_unit_hitset, str(myhiddens))
+
if verbose >= 9 and of.startswith("h"):
print_warning(req, "Search stage 1: pattern %s gave hitlist %s" % (cgi.escape(bsu_p), basic_search_unit_hitset))
# NOTE(review): the last line of the condition below is truncated -- the
# text between '(idx_unit+1)' and '0:' is missing.  That includes the end
# of this condition, the body of its branch, and the code that computes
# bsu_pn (used further down).  It must be restored from the upstream
# source before this function can be changed safely.
if len(basic_search_unit_hitset) > 0 or \
ap==0 or \
bsu_o=="|" or \
((idx_unit+1) 0:
# we retain the new unit instead
if of.startswith('h'):
print_warning(req, _("No exact match found for %(x_query1)s, using %(x_query2)s instead...") % \
{'x_query1': "" + cgi.escape(bsu_p) + "",
'x_query2': "" + cgi.escape(bsu_pn) + ""})
basic_search_units[idx_unit][1] = bsu_pn
basic_search_units_hitsets.append(basic_search_unit_hitset)
else:
# stage 2-3: no hits found either, propose nearest indexed terms:
if of.startswith('h') and display_nearest_terms_box:
if req:
if bsu_f == "recid":
print_warning(req, "Requested record does not seem to exist.")
else:
print_warning(req, create_nearest_terms_box(req.argd, bsu_p, bsu_f, bsu_m, ln=ln))
return hitset_empty
else:
# stage 2-3: no hits found either, propose nearest indexed terms:
if of.startswith('h') and display_nearest_terms_box:
if req:
if bsu_f == "recid":
print_warning(req, "Requested record does not seem to exist.")
else:
print_warning(req, create_nearest_terms_box(req.argd, bsu_p, bsu_f, bsu_m, ln=ln))
return hitset_empty
if verbose and of.startswith("h"):
t2 = os.times()[4]
for idx_unit in range(0, len(basic_search_units)):
print_warning(req, "Search stage 2: basic search unit %s gave %d hits." %
(basic_search_units[idx_unit][1:], len(basic_search_units_hitsets[idx_unit])))
print_warning(req, "Search stage 2: execution took %.2f seconds." % (t2 - t1))
# search stage 3: apply boolean query for each search unit:
if verbose and of.startswith("h"):
t1 = os.times()[4]
# let the initial set be the complete universe:
hitset_in_any_collection = HitSet(trailing_bits=1)
hitset_in_any_collection.discard(0)
# combine the per-unit hitsets in order: '+' intersects, '-' subtracts,
# '|' unites; any other operator is reported and otherwise ignored
for idx_unit in xrange(len(basic_search_units)):
this_unit_operation = basic_search_units[idx_unit][0]
this_unit_hitset = basic_search_units_hitsets[idx_unit]
if this_unit_operation == '+':
hitset_in_any_collection.intersection_update(this_unit_hitset)
elif this_unit_operation == '-':
hitset_in_any_collection.difference_update(this_unit_hitset)
elif this_unit_operation == '|':
hitset_in_any_collection.union_update(this_unit_hitset)
else:
if of.startswith("h"):
print_warning(req, "Invalid set operation %s." % cgi.escape(this_unit_operation), "Error")
if len(hitset_in_any_collection) == 0:
# no hits found, propose alternative boolean query:
if of.startswith('h') and display_nearest_terms_box:
nearestterms = []
for idx_unit in range(0, len(basic_search_units)):
bsu_o, bsu_p, bsu_f, bsu_m = basic_search_units[idx_unit]
# a pattern wrapped in '%' is shown wrapped in single quotes instead
if bsu_p.startswith("%") and bsu_p.endswith("%"):
bsu_p = "'" + bsu_p[1:-1] + "'"
bsu_nbhits = len(basic_search_units_hitsets[idx_unit])
# create a similar query, but with the basic search unit only
# NOTE(review): req.argd is read without checking req, although req
# defaults to None -- confirm callers always pass a request when
# 'of' is an HTML format.
argd = {}
argd.update(req.argd)
argd['p'] = bsu_p
argd['f'] = bsu_f
nearestterms.append((bsu_p, bsu_nbhits, argd))
text = websearch_templates.tmpl_search_no_boolean_hits(
ln=ln, nearestterms=nearestterms)
print_warning(req, text)
if verbose and of.startswith("h"):
t2 = os.times()[4]
print_warning(req, "Search stage 3: boolean query gave %d hits." % len(hitset_in_any_collection))
print_warning(req, "Search stage 3: execution took %.2f seconds." % (t2 - t1))
return hitset_in_any_collection
def search_pattern_parenthesised(req=None, p=None, f=None, m=None, ap=0, of="id", verbose=0, ln=CFG_SITE_LANG, display_nearest_terms_box=True):
    """Search for complex pattern 'p' containing parentheses within field 'f' according to
    matching type 'm'. Return hitset of recIDs.

    The pattern is first passed through the SPIRES-to-Invenio syntax
    converter.  If it then contains no parentheses construct (as
    detected by re_pattern_parens), or if the parenthesised parser
    rejects it, the search is delegated to search_pattern(); in the
    latter case all the parentheses are blanked out first.

    For more details on the parameters see 'search_pattern'.
    """
    _ = gettext_set_language(ln)
    # if the pattern uses SPIRES search syntax, convert it to Invenio syntax
    spires_syntax_converter = SpiresToInvenioSyntaxConverter()
    p = spires_syntax_converter.convert_query(p)
    # sanity check: do not call parenthesised parser for search terms
    # like U(1):
    if not re_pattern_parens.search(p):
        return search_pattern(req, p, f, m, ap, of, verbose, ln, display_nearest_terms_box=display_nearest_terms_box)
    # Try searching with parentheses
    try:
        parser = SearchQueryParenthesisedParser()
        # get a hitset with all recids
        result_hitset = HitSet(trailing_bits=1)
        # parse the query. The result is list of [op1, expr1, op2, expr2, ..., opN, exprN]
        parsing_result = parser.parse_query(p)
        if verbose and of.startswith("h"):
            print_warning(req, "Search stage 1: search_pattern_parenthesised() returned %s." % repr(parsing_result))
        # go through every (operator, pattern) pair, calculate the
        # hitset of the pattern and combine it with the result using
        # the corresponding operator
        for index in xrange(0, len(parsing_result)-1, 2):
            current_operator = parsing_result[index]
            current_pattern = parsing_result[index+1]
            # obtain a hitset for the current pattern
            current_hitset = search_pattern(req, current_pattern, f, m, ap, of, verbose, ln, display_nearest_terms_box=display_nearest_terms_box)
            # combine the current hitset with resulting hitset using the current operator
            if current_operator == '+':
                result_hitset = result_hitset & current_hitset
            elif current_operator == '-':
                result_hitset = result_hitset - current_hitset
            elif current_operator == '|':
                result_hitset = result_hitset | current_hitset
            else:
                assert False, "Unknown operator in search_pattern_parenthesised()"
        return result_hitset
    # If searching with parentheses fails, perform search ignoring parentheses
    except InvenioWebSearchQueryParserException:
        # as everywhere else in the search stages, only HTML output
        # formats get the warning; it would otherwise end up inside
        # non-HTML (e.g. XML or id-list) output
        if of.startswith("h"):
            print_warning(req, _("Nested or mismatched parentheses detected. Ignoring all parentheses in the query..."))
        # remove the parentheses in the query. Current implementation removes all the parentheses,
        # but it could be improved to remove only those that are not inside quotes
        p = p.replace('(', ' ')
        p = p.replace(')', ' ')
        return search_pattern(req, p, f, m, ap, of, verbose, ln, display_nearest_terms_box=display_nearest_terms_box)
def search_unit(p, f=None, m=None):
    """Search for basic search unit defined by pattern 'p' and field
    'f' and matching type 'm'.  Return hitset of recIDs.

    Dispatching rules, in this order:
      - empty 'p' gives an empty hitset;
      - 'm' equal to 'a' or 'r' (phrase or regexp search) goes to the
        phrase index of 'f' when 'f' has an index, to the bibxxx
        tables otherwise;
      - 'p' starting with 'cited:' is a search by citation count;
      - anything else is a word search.

    All the parameters are assumed to have been previously washed.
    'p' is assumed to be already a ``basic search unit'' so that it
    is searched as such and is not broken up in any way.  Only
    wildcard and span queries are being detected inside 'p'.
    This function is suitable as a low-level API.
    """
    if not p: # sanity checking
        return HitSet()
    if m == 'a' or m == 'r':
        # we are doing either phrase search or regexp search
        if get_index_id_from_field(f) != 0:
            return search_unit_in_idxphrases(p, f, m)
        return search_unit_in_bibxxx(p, f, m)
    if p.startswith("cited:"):
        # we are doing search by the citation count
        return search_unit_by_times_cited(p[6:])
    # we are doing bibwords search by default
    return search_unit_in_bibwords(p, f)
def search_unit_in_bibwords(word, f, decompress=zlib.decompress):
    """Searches for 'word' inside bibwordsX table for field 'f' and returns hitset of recIDs.

    The word index of 'f' is used, or the one of 'anyfield' when 'f'
    is empty; an empty hitset is returned when 'f' has no index.
    '*' in 'word' is the truncation character and 'word1->word2' is a
    span query.  Except for the non-span queries on the 'journal'
    field, the words are washed with re_word and, if the index has a
    stemming language, lowered and stemmed.
    """
    # deduce into which bibwordsX table we will search:
    stemming_language = get_index_stemming_language(get_index_id_from_field("anyfield"))
    bibwordsX = "idxWORD%02dF" % get_index_id_from_field("anyfield")
    if f:
        index_id = get_index_id_from_field(f)
        if not index_id:
            return HitSet() # word index f does not exist
        bibwordsX = "idxWORD%02dF" % index_id
        stemming_language = get_index_stemming_language(index_id)
    # wash 'word' argument and run query:
    word = string.replace(word, '*', '%') # we now use '*' as the truncation character
    span_bounds = string.split(word, "->", 1) # check for span query
    if len(span_bounds) == 2:
        lower_bound = re_word.sub('', span_bounds[0])
        upper_bound = re_word.sub('', span_bounds[1])
        if stemming_language:
            lower_bound = lower_index_term(lower_bound)
            upper_bound = lower_index_term(upper_bound)
            lower_bound = stem(lower_bound, stemming_language)
            upper_bound = stem(upper_bound, stemming_language)
        res = run_sql("SELECT term,hitlist FROM %s WHERE term BETWEEN %%s AND %%s" % bibwordsX,
                      (wash_index_term(lower_bound), wash_index_term(upper_bound)))
    else:
        is_journal = (f == 'journal')
        # FIXME: quick hack for the journal index: its words are
        # neither washed nor stemmed
        if not is_journal:
            word = re_word.sub('', word)
            if stemming_language:
                word = lower_index_term(word)
                word = stem(word, stemming_language)
        if '%' not in word:
            res = run_sql("SELECT term,hitlist FROM %s WHERE term=%%s" % bibwordsX,
                          (wash_index_term(word),))
        elif is_journal:
            # FIXME: quick hack for the journal index
            # FIXME: we can run a sanity check here for all indexes
            res = ()
        else:
            # we have a wildcard in the word
            res = run_sql("SELECT term,hitlist FROM %s WHERE term LIKE %%s" % bibwordsX,
                          (wash_index_term(word),))
    # fill the result set; the first hitlist becomes the result as is,
    # which circumvents one set operation, the others are united into it:
    result_hitset = None
    for dummy_term, hitlist in res:
        term_hitset = HitSet(hitlist)
        if result_hitset is None:
            result_hitset = term_hitset
        else:
            result_hitset.union_update(term_hitset)
    if result_hitset is None:
        result_hitset = HitSet()
    # okay, return result set:
    return result_hitset
def search_unit_in_idxphrases(p, f, type):
    """Searches for phrase 'p' inside idxPHRASE*F table for field 'f' and returns hitset of recIDs found.
    The search type is defined by 'type' (e.g. equals to 'r' for a regexp search).

    The phrase index of 'f' is used, or the one of 'anyfield' when 'f'
    is empty; an empty hitset is returned when 'f' has no index.  For
    the non-regexp searches, '*' in 'p' is the truncation character
    and 'phrase1->phrase2' is a span query.
    """
    # deduce in which idxPHRASE table we will search:
    idxphraseX = "idxPHRASE%02dF" % get_index_id_from_field("anyfield")
    if f:
        index_id = get_index_id_from_field(f)
        if not index_id:
            return HitSet() # phrase index f does not exist
        idxphraseX = "idxPHRASE%02dF" % index_id
    # detect query type (regexp, span, partial phrase, exact phrase):
    if type == 'r':
        term_condition, term_params = "REGEXP %s", (p,)
    else:
        p = string.replace(p, '*', '%') # we now use '*' as the truncation character
        span_bounds = string.split(p, "->", 1) # check for span query
        if len(span_bounds) == 2:
            term_condition, term_params = "BETWEEN %s AND %s", (span_bounds[0], span_bounds[1])
        elif string.find(p, '%') > -1:
            term_condition, term_params = "LIKE %s", (span_bounds[0],)
        else:
            term_condition, term_params = "= %s", (span_bounds[0],)
    # perform search:
    res = run_sql("SELECT term,hitlist FROM %s WHERE term %s" % (idxphraseX, term_condition),
                  term_params)
    # fill the result set; the first hitlist becomes the result as is,
    # which circumvents one set operation, the others are united into it:
    result_hitset = None
    for dummy_term, hitlist in res:
        phrase_hitset = HitSet(hitlist)
        if result_hitset is None:
            result_hitset = phrase_hitset
        else:
            result_hitset.union_update(phrase_hitset)
    if result_hitset is None:
        result_hitset = HitSet()
    # okay, return result set:
    return result_hitset
def search_unit_in_bibxxx(p, f, type):
    """Searches for pattern 'p' inside bibxxx tables for field 'f' and returns hitset of recIDs found.
    The search type is defined by 'type' (e.g. equals to 'r' for a regexp search).

    'f' is either a MARC tag (it then starts with two digits; '*' may
    be used as the truncation character) or a field name, which is
    converted into the list of its MARC tags.  For the non-regexp
    searches, '*' in 'p' is the truncation character and
    'value1->value2' is a span query.

    A field with no known tags gives an empty hitset.
    """
    # FIXME: quick hack for the journal index
    if f == 'journal':
        return search_unit_in_bibwords(p, f)
    query_addons = "" # will hold additional SQL code for the query
    query_params = () # will hold parameters for the query (their number may vary depending on TYPE argument)
    # wash arguments:
    f = string.replace(f, '*', '%') # replace truncation char '*' in field definition
    if type == 'r':
        query_addons = "REGEXP %s"
        query_params = (p,)
    else:
        p = string.replace(p, '*', '%') # we now use '*' as the truncation character
        ps = string.split(p, "->", 1) # check for span query:
        if len(ps) == 2:
            query_addons = "BETWEEN %s AND %s"
            query_params = (ps[0], ps[1])
        else:
            if string.find(p, '%') > -1:
                query_addons = "LIKE %s"
                query_params = (ps[0],)
            else:
                query_addons = "= %s"
                query_params = (ps[0],)
    # construct 'tl' which defines the tag list (MARC tags) to search in:
    tl = []
    # the length is checked first, so that an empty or one-character
    # field is handled as a field name instead of raising IndexError
    if len(f) >= 2 and str(f[0]).isdigit() and str(f[1]).isdigit():
        tl.append(f) # 'f' seems to be okay as it starts by two digits
    else:
        # convert old ALEPH tag names, if appropriate: (TODO: get rid of this before entering this function)
        if string.lower(f) in CFG_WEBSEARCH_FIELDS_CONVERT:
            f = CFG_WEBSEARCH_FIELDS_CONVERT[string.lower(f)]
        # deduce desired MARC tags on the basis of chosen 'f'; if the
        # f index does not exist, 'tl' stays empty and so will the result
        tl = get_field_tags(f)
    # okay, start search:
    l = [] # will hold list of recID that matched
    for t in tl:
        # deduce into which bibxxx table we will search:
        digit1, digit2 = int(t[0]), int(t[1])
        bx = "bib%d%dx" % (digit1, digit2)
        bibx = "bibrec_bib%d%dx" % (digit1, digit2)
        # construct and run query:
        if t == "001":
            # tag 001 is searched directly in the record ID column
            res = run_sql("SELECT id FROM bibrec WHERE id %s" % query_addons,
                          query_params)
        else:
            query = "SELECT bibx.id_bibrec FROM %s AS bx LEFT JOIN %s AS bibx ON bx.id=bibx.id_bibxxx WHERE bx.value %s" % \
                    (bx, bibx, query_addons)
            if len(t) != 6 or t[-1:]=='%':
                # wildcard query, or only the beginning of field 't'
                # is defined, so add wildcard character:
                query += " AND bx.tag LIKE %s"
                res = run_sql(query, query_params + (t + '%',))
            else:
                # exact query for 't':
                query += " AND bx.tag=%s"
                res = run_sql(query, query_params + (t,))
        # fill the result set, leaving out the empty record IDs:
        for id_bibrec in res:
            if id_bibrec[0]:
                l.append(id_bibrec[0])
    # okay, return result set:
    return HitSet(l)
def search_unit_in_bibrec(datetext1, datetext2, type='c'):
    """
    Return hitset of recIDs of the records that were either created
    or modified from datetext1 until datetext2, both bounds included.

    The modification date is used when 'type' starts with 'm'; any
    other value selects the creation date.

    Does not pay attention to pattern, collection, anything.  Useful
    to intersect later on with the 'real' query.
    """
    if type.startswith("m"):
        date_column = "modification_date"
    else:
        date_column = "creation_date" # by default we are searching for creation dates
    query = "SELECT id FROM bibrec WHERE %s>=%%s AND %s<=%%s" % (date_column, date_column)
    matching_recids = HitSet()
    for row in run_sql(query, (datetext1, datetext2)):
        matching_recids += row[0]
    return matching_recids
def search_unit_by_times_cited(p):
    """
    Return hitset of recIDs found that are cited P times.
    Usually P looks like '10->23'.
    """
    numstr = '"'+p+'"'
    # Records that do _not_ have cites can only be found when the IDs
    # of all the records are known, so these are fetched too -- but
    # only when P is 0, '0', or a span starting or ending with 0.
    zero_cites_wanted = p == 0 or p == "0" or \
                        p.startswith("0->") or p.endswith("->0")
    if zero_cites_wanted:
        allrecs = HitSet(run_sql_cached("SELECT id FROM bibrec", affected_tables=['bibrec']))
    else:
        allrecs = []
    return get_records_with_num_cites(numstr, allrecs)
def intersect_results_with_collrecs(req, hitset_in_any_collection, colls, ap=0, of="hb", verbose=0, ln=CFG_SITE_LANG, display_nearest_terms_box=True):
    """Return dict of hitsets given by intersection of hitset with the collection universes.

    Every collection name in 'colls' is mapped to the part of
    'hitset_in_any_collection' that lies inside that collection.  When
    no collection yields a hit, an empty dict is returned; for output
    formats starting with 'h' and with 'display_nearest_terms_box' set,
    a warning is also written to 'req', pointing to the hits found in
    the Home collection when there are some.

    The 'ap' argument is accepted but not used here.
    """
    _ = gettext_set_language(ln)
    # search stage 4: intersect with the collection universe:
    if verbose and of.startswith("h"):
        t1 = os.times()[4]
    results = {}
    results_nbhits = 0
    for coll in colls:
        results[coll] = hitset_in_any_collection & get_collection_reclist(coll)
        results_nbhits += len(results[coll])
    if results_nbhits == 0:
        # no hits found, try to search in Home:
        results_in_Home = hitset_in_any_collection & get_collection_reclist(CFG_SITE_NAME)
        if len(results_in_Home) > 0:
            # some hits found in Home, so propose this search:
            if of.startswith("h") and display_nearest_terms_box:
                url = websearch_templates.build_search_url(req.argd, cc=CFG_SITE_NAME, c=[])
                # The x_url_open/x_url_close placeholders wrap the hit
                # count in a link to the Home search.  The opening value
                # must actually contain a '%s' for the URL, otherwise
                # the formatting raises TypeError.
                # NOTE(review): url is assumed to be already safe for
                # use inside an HTML attribute -- confirm against
                # build_search_url.
                print_warning(req, _("No match found in collection %(x_collection)s. Other public collections gave %(x_url_open)s%(x_nb_hits)d hits%(x_url_close)s.") %\
                              {'x_collection': string.join([get_coll_i18nname(coll, ln, False) for coll in colls], ', '),
                               'x_url_open': '<a href="%s">' % (url),
                               'x_nb_hits': len(results_in_Home),
                               'x_url_close': '</a>'})
            results = {}
        else:
            # no hits found in Home, recommend different search terms:
            if of.startswith("h") and display_nearest_terms_box:
                print_warning(req, _("No public collection matched your query. "
                                     "If you were looking for a non-public document, please choose "
                                     "the desired restricted collection first."))
            results = {}
    if verbose and of.startswith("h"):
        t2 = os.times()[4]
        print_warning(req, "Search stage 4: intersecting with collection universe gave %d hits." % results_nbhits)
        print_warning(req, "Search stage 4: execution took %.2f seconds." % (t2 - t1))
    return results
def intersect_results_with_hitset(req, results, hitset, ap=0, aptext="", of="hb"):
    """Intersect every hitset stored in 'results' (a dict keyed by
    collection) with 'hitset' and return the dict.

    The hitsets inside 'results' are updated in place.  When nothing is
    left in any collection, 'aptext' is printed as a warning (for output
    formats starting with 'h') and the fallback is returned instead: a
    copy of the original, untouched 'results' when 'ap' (approximate
    pattern) is true, an empty dict otherwise.
    """
    # The snapshot has to be taken before the in-place update below.
    fallback = {}
    if ap:
        fallback = copy.deepcopy(results)
    nb_total = 0
    for coll_hits in results.values():
        coll_hits.intersection_update(hitset)
        nb_total += len(coll_hits)
    if nb_total != 0:
        return results
    if of.startswith("h"):
        print_warning(req, aptext)
    return fallback
def create_similarly_named_authors_link_box(author_name, ln=CFG_SITE_LANG):
    """Return a box, similar to the ``Not satisfied...'' one, proposing
    author searches for similar name forms.

    AUTHOR_NAME is cut down to the surname plus the first initial of
    the first name (after the comma); the author tags are then searched
    for stored values starting with that prefix, so that forms such as
    Ellis-N, Ellis-Nick, Ellis-Nicolas are all found.  An empty string
    is returned when the feature is switched off, when the name has no
    initial, or when no other name form exists.
    """
    # feature switched off in the configuration:
    if CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX == 0:
        return ""
    # nothing to propose when there is no initial after the comma:
    if re.match(r'[^ ,]+, [^ ]', author_name) is None:
        return ""
    # firstly reduce the name to "surname, initial":
    name_prefix = re.sub(r'^([^ ,]+, +[^ ,]).*$', '\\1', author_name)
    # secondly collect the name forms starting with that prefix, with
    # and without accents (dict keys keep them unique):
    found_names = {}
    for prefix in (name_prefix, strip_accents(name_prefix)):
        for tag in get_field_tags("author"):
            # the first two digits of the tag select the bibxxx table:
            bx = "bib%d%dx" % (int(tag[0]), int(tag[1]))
            if len(tag) == 6 and tag[-1:] != '%':
                # fully specified tag, match it exactly:
                rows = run_sql("""SELECT bx.value FROM %s AS bx
                                   WHERE bx.value LIKE %%s AND bx.tag=%%s""" % bx,
                               (prefix + "%", tag))
            else:
                # only the beginning of the tag is defined, so add wildcard character:
                rows = run_sql("""SELECT bx.value FROM %s AS bx
                                   WHERE bx.value LIKE %%s AND bx.tag LIKE %%s""" % bx,
                               (prefix + "%", tag + "%"))
            for row in rows:
                found_names[row[0]] = 1
    # the name we started from is not a suggestion:
    found_names.pop(author_name, None)
    # thirdly print the box:
    out = ""
    if found_names:
        authors_with_hits = []
        for candidate in sorted(found_names.keys()):
            nbhits = get_nbhits_in_bibxxx(candidate, "author")
            if nbhits:
                authors_with_hits.append((candidate, nbhits))
        out += websearch_templates.tmpl_similar_author_names(
            authors=authors_with_hits, ln=ln)
    return out
def create_nearest_terms_box(urlargd, p, f, t='w', n=5, ln=CFG_SITE_LANG, intro_text_p=True):
    """Return text box listing the 'n' nearest terms above/below 'p'
    for the field 'f' and matching type 't' (words/phrases), in
    language 'ln'.

    For every term a new set of search arguments is derived from
    `urlargd' by replacing the pattern with that term.  If
    `intro_text_p' is true the box starts with an introductory message,
    otherwise it contains the nearest terms only.  When no index can
    provide any term, a short explanatory message is returned instead
    of the box.
    """
    # load the right message language
    _ = gettext_set_language(ln)
    if not p: # sanity check
        p = "."
    index_id = get_index_id_from_field(f)
    # look for nearest terms:
    if t == 'w':
        nearest_terms = get_nearest_terms_in_bibwords(p, f, n, n)
        if not nearest_terms:
            return _("No word index is available for %s.") % \
                   cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False))
    else:
        nearest_terms = []
        if index_id:
            nearest_terms = get_nearest_terms_in_idxphrase(p, index_id, n, n)
        if not nearest_terms:
            # phrase index gave nothing, fall back to the bibxxx tables:
            nearest_terms = get_nearest_terms_in_bibxxx(p, f, n, n)
        if not nearest_terms:
            return _("No phrase index is available for %s.") % \
                   cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False))
    terminfo = []
    for term in nearest_terms:
        # count the hits in the same kind of index the term came from:
        if t == 'w':
            hits = get_nbhits_in_bibwords(term, f)
        elif index_id:
            hits = get_nbhits_in_idxphrases(term, f)
        else:
            hits = get_nbhits_in_bibxxx(term, f)
        argd = {}
        argd.update(urlargd)
        # find the first pattern argument that contains 'p' and put the
        # proposed term in its place:
        for (px, fx) in ('p', 'f'), ('p1', 'f1'), ('p2', 'f2'), ('p3', 'f3'):
            if px not in argd:
                continue
            argd_px = argd[px]
            if t == 'w':
                # p was stripped of accents, to do the same:
                argd_px = strip_accents(argd_px)
            if f == argd[fx] or f == "anyfield" or f == "":
                if argd_px.find(p) > -1:
                    argd[px] = argd_px.replace(p, term)
                    break
            else:
                # the field is spelled inside the pattern, as f:p or f:"p"
                bare = f + ':' + p
                quoted = f + ':"' + p + '"'
                if argd_px.find(bare) > -1:
                    argd[px] = argd_px.replace(bare, f + ':' + term)
                    break
                elif argd_px.find(quoted) > -1:
                    argd[px] = argd_px.replace(quoted, f + ':"' + term + '"')
                    break
        terminfo.append((term, hits, argd))
    intro = ""
    if intro_text_p: # add full leading introductory text
        # show the pattern without its surrounding '%' wildcards, unless
        # nothing would be left of it:
        shown_term = p
        if p.startswith("%") and p.endswith("%") and p[1:-1]:
            shown_term = p[1:-1]
        if f:
            intro = _("Search term %(x_term)s inside index %(x_index)s did not match any record. Nearest terms in any collection are:") % \
                    {'x_term': cgi.escape(shown_term),
                     'x_index': cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False))}
        else:
            intro = _("Search term %s did not match any record. Nearest terms in any collection are:") % \
                    cgi.escape(shown_term)
    return websearch_templates.tmpl_nearest_term_box(p=p, ln=ln, f=f, terminfo=terminfo,
                                                     intro=intro)
def get_nearest_terms_in_bibwords(p, f, n_below, n_above):
    """Return the sorted list of nearest terms around word `p' in the
    word index for field `f': up to `n_above' terms sorting before `p',
    then `p' itself, then up to `n_below' terms sorting after it.

    Without `f' the "anyfield" index is used; an empty list is returned
    when `f' is given but has no index.
    """
    # deduce into which idxWORDxxF table we will search:
    word_table = "idxWORD%02dF" % get_index_id_from_field("anyfield")
    if f:
        field_index_id = get_index_id_from_field(f)
        if not field_index_id:
            return []
        word_table = "idxWORD%02dF" % field_index_id
    # closest terms before `p' come back nearest-first, so flip them:
    preceding = [row[0] for row in
                 run_sql("SELECT term FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % word_table,
                         (p, n_above))]
    preceding.reverse()
    # closest terms after `p':
    following = [row[0] for row in
                 run_sql("SELECT term FROM %s WHERE term>%%s ORDER BY term ASC LIMIT %%s" % word_table,
                         (p, n_below))]
    return preceding + [p] + following
def get_nearest_terms_in_idxphrase(p, index_id, n_below, n_above):
    """Browse the idxPHRASE table of index `index_id' around pattern
    `p', regardless of collection: up to `n_above' phrases sorting
    before `p' followed by up to `n_below' phrases equal to or after it.
    Return list of [phrase1, phrase2, ... , phrase_n]."""
    phrase_table = "idxPHRASE%02dF" % index_id
    rows_before = run_sql("SELECT term FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % phrase_table, (p, n_above))
    # rows come back nearest-first; flip them into ascending order
    phrases_before = [row[0] for row in rows_before]
    phrases_before.reverse()
    rows_after = run_sql("SELECT term FROM %s WHERE term>=%%s ORDER BY term ASC LIMIT %%s" % phrase_table, (p, n_below))
    phrases_after = [row[0] for row in rows_after]
    return phrases_before + phrases_after
def get_nearest_terms_in_idxphrase_with_collection(p, index_id, n_below, n_above, collection):
    """Browse the idxPHRASE table of index `index_id' around pattern
    `p', keeping only phrases that have at least one record inside
    `collection' (a HitSet).
    Return list of (phrase, number of hits in the collection) pairs:
    at most `n_above' before `p' and at most `n_below' from `p' on."""
    def count_in_collection(rows):
        # drop phrases without any record in the collection
        counted = []
        for term, hitlist in rows:
            hits_in_coll = HitSet(hitlist) & collection
            if hits_in_coll:
                counted.append((term, len(hits_in_coll)))
        return counted

    phrase_table = "idxPHRASE%02dF" % index_id
    # three times more rows than needed are fetched, as some of them
    # will be filtered out by the collection:
    terms_before = count_in_collection(
        run_sql("SELECT term,hitlist FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % phrase_table, (p, n_above * 3)))
    terms_after = count_in_collection(
        run_sql("SELECT term,hitlist FROM %s WHERE term>=%%s ORDER BY term ASC LIMIT %%s" % phrase_table, (p, n_below * 3)))
    # rows before `p' were fetched nearest-first; restore ascending order
    terms_before.reverse()
    return terms_before[-n_above:] + terms_after[:n_below]
def get_nearest_terms_in_bibxxx(p, f, n_below, n_above):
    """Browse (-n_above, +n_below) closest bibliographic phrases
    for the given pattern p in the given field f, regardless
    of collection.

    When `f' is empty and `p' looks like 'field:pattern', the field is
    taken from the pattern.  The 'journal' field is delegated to the
    word index and any field having an index to its phrase index; only
    otherwise are the bibxxx tables browsed directly.

    Return list of [phrase1, phrase2, ... , phrase_n]."""
    ## determine browse field:
    if not f and string.find(p, ":") > 0: # does 'p' contain ':'?
        f, p = string.split(p, ":", 1)
    # FIXME: quick hack for the journal index
    if f == 'journal':
        return get_nearest_terms_in_bibwords(p, f, n_below, n_above)
    index_id = get_index_id_from_field(f)
    if index_id:
        return get_nearest_terms_in_idxphrase(p, index_id, n_below, n_above)
    ## We are going to take max(n_below, n_above) as the number of
    ## values to fetch from bibXXx.  This is needed to work around
    ## MySQL UTF-8 sorting troubles in 4.0.x.  Proper solution is to
    ## use MySQL 4.1.x or our own idxPHRASE in the future.
    n_fetch = 2*max(n_below, n_above)
    ## construct 'tl' which defines the tag list (MARC tags) to search in:
    if f and len(f) >= 2 and str(f[0]).isdigit() and str(f[1]).isdigit():
        tl = [f] # 'f' seems to be okay as it starts by two digits
    else:
        # deduce desired MARC tags on the basis of chosen 'f'; an empty
        # or one-character 'f' ends up here too instead of raising
        # IndexError on f[1]
        tl = get_field_tags(f)
    ## start browsing to fetch list of hits:
    browsed_phrases = {} # {phrase1: 1, ..., phraseN: 1} dict of browsed phrases (to make them unique)
    # always add self to the results set (without surrounding '%'):
    browsed_phrases[p.startswith("%") and p.endswith("%") and p[1:-1] or p] = 1
    for t in tl:
        # deduce into which bibxxx table we will search:
        digit1, digit2 = int(t[0]), int(t[1])
        bx = "bib%d%dx" % (digit1, digit2)
        if len(t) != 6 or t[-1:] == '%':
            # only the beginning of field 't' is defined, so add wildcard character:
            tag_test, tag_value = "bx.tag LIKE %s", t + "%"
        else:
            tag_test, tag_value = "bx.tag=%s", t
        # firstly try to get `n' closest phrases above `p':
        res = run_sql("SELECT bx.value FROM " + bx + " AS bx"
                      " WHERE bx.value<%s AND " + tag_test +
                      " ORDER BY bx.value DESC LIMIT %s",
                      (p, tag_value, n_fetch))
        for row in res:
            browsed_phrases[row[0]] = 1
        # secondly try to get `n' closest phrases equal to or below `p':
        res = run_sql("SELECT bx.value FROM " + bx + " AS bx"
                      " WHERE bx.value>=%s AND " + tag_test +
                      " ORDER BY bx.value ASC LIMIT %s",
                      (p, tag_value, n_fetch))
        for row in res:
            browsed_phrases[row[0]] = 1
    # select first n words only: (this is needed as we were searching
    # in many different tables and so aren't sure we have more than n
    # words right; this of course won't be needed when we shall have
    # one ACC table only for given field):
    phrases_out = browsed_phrases.keys()
    # accent- and case-insensitive order; the key is computed once per phrase
    phrases_out.sort(key=lambda phrase: string.lower(strip_accents(phrase)))
    # find position of self:
    try:
        idx_p = phrases_out.index(p)
    except ValueError:
        # 'p' itself is not among the phrases, so centre on the middle
        idx_p = len(phrases_out) // 2
    # return n_above and n_below:
    return phrases_out[max(0, idx_p-n_above):idx_p+n_below]
def get_nbhits_in_bibwords(word, f):
    """Return number of hits for word 'word' inside words index for field 'f'.

    Without 'f' the "anyfield" index is consulted; 0 is returned when
    'f' has no index or when 'word' is empty.
    """
    # deduce into which idxWORDxxF table we will search:
    word_table = "idxWORD%02dF" % get_index_id_from_field("anyfield")
    if f:
        field_index_id = get_index_id_from_field(f)
        if not field_index_id:
            return 0
        word_table = "idxWORD%02dF" % field_index_id
    if not word:
        return 0
    rows = run_sql("SELECT hitlist FROM %s WHERE term=%%s" % word_table,
                   (word,))
    # add up the sizes of all hitlists stored for the term:
    return sum([len(HitSet(row[0])) for row in rows])
def get_nbhits_in_idxphrases(word, f):
    """Return number of hits for word 'word' inside phrase index for field 'f'.

    Without 'f' the "anyfield" index is consulted; 0 is returned when
    'f' has no index or when 'word' is empty.
    """
    # deduce into which idxPHRASExxF table we will search:
    phrase_table = "idxPHRASE%02dF" % get_index_id_from_field("anyfield")
    if f:
        field_index_id = get_index_id_from_field(f)
        if not field_index_id:
            return 0
        phrase_table = "idxPHRASE%02dF" % field_index_id
    if not word:
        return 0
    rows = run_sql("SELECT hitlist FROM %s WHERE term=%%s" % phrase_table,
                   (word,))
    # add up the sizes of all hitlists stored for the term:
    return sum([len(HitSet(row[0])) for row in rows])
def get_nbhits_in_bibxxx(p, f):
    """Return number of distinct records having exactly the value 'p'
    in the bibxxx tables for field 'f'.

    'f' may be a MARC tag (recognised by its two leading digits) or a
    field code, which is translated to its MARC tags.  When 'f' is
    empty and 'p' looks like 'field:value', the field is taken from
    'p'.  The 'journal' field is delegated to the word index.
    """
    ## determine browse field:
    if not f and string.find(p, ":") > 0: # does 'p' contain ':'?
        f, p = string.split(p, ":", 1)
    # FIXME: quick hack for the journal index
    if f == 'journal':
        return get_nbhits_in_bibwords(p, f)
    ## construct 'tl' which defines the tag list (MARC tags) to search in:
    if f and len(f) >= 2 and str(f[0]).isdigit() and str(f[1]).isdigit():
        tl = [f] # 'f' seems to be okay as it starts by two digits
    else:
        # deduce desired MARC tags on the basis of chosen 'f'; an empty
        # or one-character 'f' ends up here too instead of raising
        # IndexError on f[1]
        tl = get_field_tags(f)
    # start searching:
    recIDs = {} # will hold dict of {recID1: 1, recID2: 1, ..., } (unique recIDs, therefore)
    for t in tl:
        # deduce into which bibxxx table we will search:
        digit1, digit2 = int(t[0]), int(t[1])
        bx = "bib%d%dx" % (digit1, digit2)
        bibx = "bibrec_bib%d%dx" % (digit1, digit2)
        if len(t) != 6 or t[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character:
            res = run_sql("""SELECT bibx.id_bibrec FROM %s AS bibx, %s AS bx
                              WHERE bx.value=%%s AND bx.tag LIKE %%s
                                AND bibx.id_bibxxx=bx.id""" % (bibx, bx),
                          (p, t + "%"))
        else:
            res = run_sql("""SELECT bibx.id_bibrec FROM %s AS bibx, %s AS bx
                              WHERE bx.value=%%s AND bx.tag=%%s
                                AND bibx.id_bibxxx=bx.id""" % (bibx, bx),
                          (p, t))
        for row in res:
            recIDs[row[0]] = 1
    return len(recIDs)
def get_mysql_recid_from_aleph_sysno(sysno):
    """Returns DB's recID for ALEPH sysno passed in the argument (e.g. "002379334CER"),
    looked up among the 970__a values.
    Returns None in case of failure."""
    rows = run_sql("""SELECT bb.id_bibrec FROM bibrec_bib97x AS bb, bib97x AS b
                       WHERE b.value=%s AND b.tag='970__a' AND bb.id_bibxxx=b.id""",
                   (sysno,))
    if not rows:
        return None
    # only the first matching record is reported
    return rows[0][0]
def guess_primary_collection_of_a_record(recID):
    """Return primary collection name a record recid belongs to, by
    testing the first 980__a identifier against the collections'
    'collection:...' dbquery.
    May lead to bad guesses when a collection is defined dynamically
    via dbquery.
    When no collection matches, return 'CFG_SITE_NAME'."""
    primary = CFG_SITE_NAME
    identifiers = get_fieldvalues(recID, "980__a")
    if identifiers:
        rows = run_sql("SELECT name FROM collection WHERE dbquery=%s",
                       ("collection:" + identifiers[0],))
        if rows:
            primary = rows[0][0]
    # dirty hack for ATLAS collections at CERN: prefer the first of the
    # more specific collections that contains the record
    if CFG_CERN_SITE and primary in ('ATLAS Communications', 'ATLAS Internal Notes'):
        for alternative_collection in ('ATLAS Communications Physics',
                                       'ATLAS Communications General',
                                       'ATLAS Internal Notes Physics',
                                       'ATLAS Internal Notes General',):
            if recID in get_collection_reclist(alternative_collection):
                return alternative_collection
    return primary
_re_collection_url = re.compile(r'/collection/(.+)')
def guess_collection_of_a_record(recID, referer=None):
    """Return collection name a record recid belongs to.

    When a referer URL is given, the collection named in it is tried
    first: either the name in a '/collection/<name>' path or the 'cc'
    and 'c' arguments of a '/search' URL.  The first of those
    collections that contains the record wins; otherwise the primary
    collection of the record is returned."""
    if referer:
        url_parts = urlparse.urlparse(referer)
        path, query = url_parts[2], url_parts[4]
        collection_match = _re_collection_url.match(path)
        if collection_match:
            candidates = [urllib.unquote_plus(collection_match.group(1))]
        elif path.startswith('/search'):
            search_args = cgi.parse_qs(query)
            candidates = search_args.get('cc', []) + search_args.get('c', [])
        else:
            candidates = []
        for name in candidates:
            if recID in get_collection_reclist(name):
                return name
    return guess_primary_collection_of_a_record(recID)
def get_all_collections_of_a_record(recID):
    """Return the names of all the collections a record belongs to.
    Every collection known to the reclist cache is tested, so this
    function is O(n_collections)."""
    return [name for name in collection_reclist_cache.cache.keys()
            if recID in get_collection_reclist(name)]
def get_tag_name(tag_value, prolog="", epilog=""):
    """Return tag name for the known tag value, by looking up the 'tag' table,
    wrapped between 'prolog' and 'epilog'.
    Return empty string in case of failure.
    Example: input='100__%', output='first author'."""
    rows = run_sql_cached("SELECT name FROM tag WHERE value=%s", (tag_value,),
                          affected_tables=['tag',])
    if not rows:
        return ""
    return prolog + rows[0][0] + epilog
def get_fieldcodes():
    """Returns the list of distinct field codes found in the 'field'
    table, i.e. those that may have been passed as 'search options' in URL.
    Example: output=['subject','division']."""
    rows = run_sql_cached("SELECT DISTINCT(code) FROM field",
                          affected_tables=['field',])
    return [row[0] for row in rows]
def get_field_name(code):
    """Return the field name stored for the given field code,
    e.g. reportnumber -> report number.
    Return empty string when the code is not known."""
    rows = run_sql_cached("SELECT name FROM field WHERE code=%s", (code, ),
                          affected_tables=['field',])
    if not rows:
        return ""
    return rows[0][0]
def get_field_tags(field):
    """Returns the list of MARC tags attached to the field code 'field',
    highest score first.
    Returns empty list when nothing is found.
    Example: field='author', output=['100__%','700__%']."""
    rows = run_sql("""SELECT t.value FROM tag AS t, field_tag AS ft, field AS f
                       WHERE f.code=%s AND ft.id_field=f.id AND t.id=ft.id_tag
                       ORDER BY ft.score DESC""",
                   (field, ))
    return [row[0] for row in rows]
def get_fieldvalues(recIDs, tag, repetitive_values=True):
    """
    Return list of field values for field TAG for the given record ID
    or list of record IDs.  (RECIDS can be an integer or a list or
    tuple of integers; anything else yields an empty list.)

    If REPETITIVE_VALUES is set to True, then return all values even
    if they are doubled.  If set to False, then return unique values
    only.

    An empty list is also returned when TAG does not start with two
    digits naming a bibXXx table.
    """
    if isinstance(recIDs, (int, long)):
        recIDs = [recIDs,]
    if not isinstance(recIDs, (list, tuple)):
        return []
    if len(recIDs) == 0:
        return []
    if tag == "001___":
        # we have asked for tag 001 (=recID) that is not stored in bibXXx tables
        return [str(recID) for recID in recIDs]
    # we are going to look inside bibXXx tables; the first two
    # characters of the tag select the table
    digits = tag[0:2]
    try:
        table_number = int(digits)
    except ValueError:
        # invalid tag value asked for
        return []
    if table_number < 0 or table_number > 99:
        # invalid tag value asked for
        return []
    bx = "bib%sx" % digits
    bibx = "bibrec_bib%sx" % digits
    if repetitive_values:
        queryselect = "bx.value"
    else:
        queryselect = "DISTINCT(bx.value)"
    # one '%s' placeholder per record ID:
    placeholders = ",".join(["%s"] * len(recIDs))
    query = "SELECT %s FROM %s AS bx, %s AS bibx WHERE bibx.id_bibrec IN (%s) " \
            " AND bx.id=bibx.id_bibxxx AND bx.tag LIKE %%s " \
            " ORDER BY bibx.field_number, bx.tag ASC" % \
            (queryselect, bx, bibx, placeholders)
    rows = run_sql(query, tuple(recIDs) + (tag,))
    return [row[0] for row in rows]
def get_fieldvalues_alephseq_like(recID, tags_in, can_see_hidden=False):
    """Return buffer of ALEPH sequential-like textual format with fields found
    in the list TAGS_IN for record RECID.

    TAGS_IN may be a single tag or a list of tags.  A single complete
    six-character tag yields just its values, one per line; anything
    else yields "text MARC" lines of the form
    '<recID> <tag+indicators> $$<subfield code><value>...'.

    If can_see_hidden is True, just print everything.  Otherwise hide fields
    from CFG_BIBFORMAT_HIDDEN_TAGS.
    """
    out = ""
    if type(tags_in) is not list:
        tags_in = [tags_in,]
    if len(tags_in) == 1 and len(tags_in[0]) == 6:
        ## case A: one concrete subfield asked, so print its value if found
        ##         (use with care: can mislead if field has multiple occurrences)
        out += string.join(get_fieldvalues(recID, tags_in[0]),"\n")
    else:
        ## case B: print our "text MARC" format; works safely all the time
        # find out which tags to output:
        dict_of_tags_out = {}
        if not tags_in:
            for i in range(0, 10):
                for j in range(0, 10):
                    dict_of_tags_out["%d%d%%" % (i, j)] = 1
        else:
            for tag in tags_in:
                if len(tag) == 0:
                    for i in range(0, 10):
                        for j in range(0, 10):
                            dict_of_tags_out["%d%d%%" % (i, j)] = 1
                elif len(tag) == 1:
                    for j in range(0, 10):
                        dict_of_tags_out["%s%d%%" % (tag, j)] = 1
                elif len(tag) < 5:
                    dict_of_tags_out["%s%%" % tag] = 1
                else:
                    # five or more characters: keep tag and indicators.
                    # (This used to be written as a str-versus-int
                    # comparison that merely happened to be always true.)
                    dict_of_tags_out[tag[0:5]] = 1
        tags_out = dict_of_tags_out.keys()
        tags_out.sort()
        # search all bibXXx tables as needed:
        for tag in tags_out:
            digits = tag[0:2]
            try:
                intdigits = int(digits)
                if intdigits < 0 or intdigits > 99:
                    raise ValueError
            except ValueError:
                # invalid tag value asked for
                continue
            if tag.startswith("001") or tag.startswith("00%"):
                # the record ID itself is not stored in the bibXXx tables
                if out:
                    out += "\n"
                out += "%09d %s %d" % (recID, "001__", recID)
            bx = "bib%sx" % digits
            bibx = "bibrec_bib%sx" % digits
            # mind the blank before ORDER BY: the literals are glued together
            query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\
                    "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s "\
                    "ORDER BY bb.field_number, b.tag ASC" % (bx, bibx)
            res = run_sql(query, (recID, str(tag)+'%'))
            # go through fields:
            field_number_old = -999
            field_old = ""
            for row in res:
                field, value, field_number = row[0], row[1], row[2]
                # skip subfields whose tag starts with one of the hidden tags:
                if not can_see_hidden:
                    hidden = False
                    for htag in CFG_BIBFORMAT_HIDDEN_TAGS:
                        if field[0:len(htag)] == htag:
                            hidden = True
                            break
                    if hidden:
                        continue
                # print field tag whenever a new field instance starts
                if field_number != field_number_old or field[:-1] != field_old[:-1]:
                    if out:
                        out += "\n"
                    out += "%09d %s " % (recID, field[:5])
                    field_number_old = field_number
                    field_old = field
                # print subfield value
                if field[0:2] == "00" and field[-1:] == "_":
                    out += value
                else:
                    out += "$$%s%s" % (field[-1:], value)
    return out
def record_exists(recID):
    """Return 1 if record RECID exists.
    Return 0 if it doesn't exist.
    Return -1 if it exists but is marked as deleted.
    """
    if not run_sql("SELECT id FROM bibrec WHERE id=%s", (recID,), 1):
        return 0
    recID = int(recID)
    # record exists; now check whether it isn't marked as deleted
    # via one of its 980 values:
    dbcollids = get_fieldvalues(recID, "980__%")
    if ("DELETED" in dbcollids) or (CFG_CERN_SITE and "DUMMY" in dbcollids):
        return -1 # exists, but marked as deleted
    return 1 # exists fine
def record_empty(recID):
    """
    Tell whether the record is empty, i.e. whether get_record() gives
    nothing or a structure with fewer than two entries (e.g. only 001,
    waiting for integration).

    @param recID: the record identifier.
    @type recID: int
    @return: 1 if the record is empty, 0 otherwise.
    @rtype: int
    """
    record = get_record(recID)
    if record is not None and len(record) >= 2:
        return 0
    return 1
def record_public_p(recID):
    """Return a true value if the record is public, i.e. if it can be
    found in the Home collection.  Return a false value otherwise.
    """
    home_recids = get_collection_reclist(CFG_SITE_NAME)
    return recID in home_recids
def get_creation_date(recID, fmt="%Y-%m-%d"):
    """Returns the creation date of the record 'recID', formatted by
    the database according to 'fmt'.  Returns empty string when the
    record is not found."""
    rows = run_sql("SELECT DATE_FORMAT(creation_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1)
    if not rows:
        return ""
    return rows[0][0]
def get_modification_date(recID, fmt="%Y-%m-%d"):
    """Returns the date of last modification for the record 'recID',
    formatted by the database according to 'fmt'.  Returns empty string
    when the record is not found."""
    rows = run_sql("SELECT DATE_FORMAT(modification_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1)
    if not rows:
        return ""
    return rows[0][0]
def print_warning(req, msg, type='', prologue=' ', epilogue=' '):
    """Writes the warning message, rendered by the warning template, to
    'req'.  Does nothing when either 'req' or 'msg' is empty."""
    if not (req and msg):
        return
    rendered = websearch_templates.tmpl_print_warning(
        msg=msg,
        type=type,
        prologue=prologue,
        epilogue=epilogue,
    )
    req.write(rendered)
    return
def print_search_info(p, f, sf, so, sp, rm, of, ot, collection=CFG_SITE_NAME, nb_found=-1, jrec=1, rg=10,
                      aas=0, ln=CFG_SITE_LANG, p1="", p2="", p3="", f1="", f2="", f3="", m1="", m2="", m3="", op1="", op2="",
                      sc=1, pl_in_url="",
                      d1y=0, d1m=0, d1d=0, d2y=0, d2m=0, d2d=0, dt="",
                      cpu_time=-1, middle_only=0):
    """Returns the stripe with the information on 'collection' and 'nb_found' results and CPU time,
       as rendered by the search-info template.
       The stripe also carries navigation links (beg/next/prev/end) inside the results set.
       If middle_only is set to 1, only the middle box information (beg/next/prev/end/etc) links are asked for.
       This is suitable for displaying navigation links at the bottom of the search results page."""
    # sanity check: keep the first record to show inside the result set
    if jrec < 1:
        jrec = 1
    if jrec > nb_found:
        jrec = max(nb_found-rg+1, 1)
    # values that have to be looked up rather than passed through:
    collection_name = get_coll_i18nname(collection, ln, False)
    collection_id = get_colID(collection)
    all_fieldcodes = get_fieldcodes()
    return websearch_templates.tmpl_print_search_info(
        # where we are:
        ln=ln, collection=collection, collection_name=collection_name,
        collection_id=collection_id, aas=aas, middle_only=middle_only,
        # paging and timing:
        rg=rg, nb_found=nb_found, jrec=jrec, cpu_time=cpu_time,
        # sorting, ranking and output:
        sf=sf, so=so, sp=sp, rm=rm, of=of, ot=ot,
        # simple and advanced search patterns:
        p=p, f=f,
        p1=p1, f1=f1, m1=m1, op1=op1,
        p2=p2, f2=f2, m2=m2, op2=op2,
        p3=p3, f3=f3, m3=m3,
        sc=sc, pl_in_url=pl_in_url,
        # date limits:
        d1y=d1y, d1m=d1m, d1d=d1d,
        d2y=d2y, d2m=d2m, d2d=d2d,
        dt=dt,
        all_fieldcodes=all_fieldcodes,
    )
def print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, collection=CFG_SITE_NAME, nb_found=-1, jrec=1, rg=10,
                             aas=0, ln=CFG_SITE_LANG, p1="", p2="", p3="", f1="", f2="", f3="", m1="", m2="", m3="", op1="", op2="",
                             sc=1, pl_in_url="",
                             d1y=0, d1m=0, d1d=0, d2y=0, d2m=0, d2d=0, dt="",
                             cpu_time=-1, middle_only=0):
    """Returns the stripe with the information on 'collection' and 'nb_found' results and CPU time,
       as rendered by the hosted-search-info template.
       The stripe also carries navigation links (beg/next/prev/end) inside the results set.
       If middle_only is set to 1, only the middle box information (beg/next/prev/end/etc) links are asked for.
       This is suitable for displaying navigation links at the bottom of the search results page."""
    # sanity check: keep the first record to show inside the result set
    if jrec < 1:
        jrec = 1
    if jrec > nb_found:
        jrec = max(nb_found-rg+1, 1)
    # values that have to be looked up rather than passed through:
    collection_name = get_coll_i18nname(collection, ln, False)
    collection_id = get_colID(collection)
    all_fieldcodes = get_fieldcodes()
    return websearch_templates.tmpl_print_hosted_search_info(
        # where we are:
        ln=ln, collection=collection, collection_name=collection_name,
        collection_id=collection_id, aas=aas, middle_only=middle_only,
        # paging and timing:
        rg=rg, nb_found=nb_found, jrec=jrec, cpu_time=cpu_time,
        # sorting, ranking and output:
        sf=sf, so=so, sp=sp, rm=rm, of=of, ot=ot,
        # simple and advanced search patterns:
        p=p, f=f,
        p1=p1, f1=f1, m1=m1, op1=op1,
        p2=p2, f2=f2, m2=m2, op2=op2,
        p3=p3, f3=f3, m3=m3,
        sc=sc, pl_in_url=pl_in_url,
        # date limits:
        d1y=d1y, d1m=d1m, d1d=d1d,
        d2y=d2y, d2m=d2m, d2d=d2d,
        dt=dt,
        all_fieldcodes=all_fieldcodes,
    )
def print_results_overview(req, colls, results_final_nb_total, results_final_nb, cpu_time, ln=CFG_SITE_LANG, ec=[], hosted_colls_potential_results_p=False):
    """Returns the results overview box, as rendered by the template,
    with links to particular collections below.

    Each collection in 'colls' is handed to the template as a dict
    with its id, its code and its name in language 'ln'."""
    colls_info = [{'id': get_colID(coll),
                   'code': coll,
                   'name': get_coll_i18nname(coll, ln, False)}
                  for coll in colls]
    return websearch_templates.tmpl_print_results_overview(
        ln=ln,
        results_final_nb_total=results_final_nb_total,
        results_final_nb=results_final_nb,
        cpu_time=cpu_time,
        colls=colls_info,
        ec=ec,
        hosted_colls_potential_results_p=hosted_colls_potential_results_p,
    )
def print_hosted_results(url_and_engine, ln=CFG_SITE_LANG, of=None, req=None, no_records_found=False, search_timed_out=False, limit=CFG_EXTERNAL_COLLECTION_MAXRESULTS):
    """Returns the full results of a hosted collection, as rendered by
    the template.

    For output formats starting with 'h' a short message is returned
    instead when no records were found or when the search timed out."""
    # 'of' defaults to None, so make sure it is set before testing it:
    if of and of.startswith("h"):
        if no_records_found:
            return " No results found."
        if search_timed_out:
            return " The search engine did not respond in time."
    return websearch_templates.tmpl_print_hosted_results(
        url_and_engine=url_and_engine,
        ln=ln,
        of=of,
        req=req,
        limit=limit
    )
def sort_records(req, recIDs, sort_field='', sort_order='d', sort_pattern='', verbose=0, of='hb', ln=CFG_SITE_LANG):
    """Sort records in 'recIDs' list according to sort field 'sort_field' in order 'sort_order'.
       If more than one instance of 'sort_field' is found for a given record, try to choose the one given by
       'sort pattern', for example "sort by report number that starts by CERN-PS".
       Note that 'sort_field' can be a field code like 'author' or a MARC tag like '100__a' directly;
       several of them may be given, separated by commas.
       The input list is returned as is when no sort field is given or when it holds more than
       CFG_WEBSEARCH_NB_RECORDS_TO_SORT records."""
    _ = gettext_set_language(ln)
    ## check arguments:
    if not sort_field:
        return recIDs
    if len(recIDs) > CFG_WEBSEARCH_NB_RECORDS_TO_SORT:
        if of.startswith('h'):
            print_warning(req, _("Sorry, sorting is allowed on sets of up to %d records only. Using default sort order.") % CFG_WEBSEARCH_NB_RECORDS_TO_SORT, "Warning")
        return recIDs
    ## first deduce sorting MARC tags out of the 'sort_field' argument:
    tags = []
    for sort_key in sort_field.split(","):
        if sort_key and str(sort_key[0:2]).isdigit():
            # sort_key starts by two digits, so this is probably a MARC tag already
            tags.append(sort_key)
            continue
        # let us check the 'field' table
        rows = run_sql("""SELECT DISTINCT(t.value) FROM tag AS t, field_tag AS ft, field AS f
                           WHERE f.code=%s AND ft.id_field=f.id AND t.id=ft.id_tag
                           ORDER BY ft.score DESC""",
                       (sort_key, ))
        if rows:
            tags.extend([row[0] for row in rows])
        else:
            if of.startswith('h'):
                print_warning(req, _("Sorry, %s does not seem to be a valid sort option. Choosing title sort instead.") % cgi.escape(sort_key), "Error")
            tags.append("245__a")
    if verbose >= 3:
        print_warning(req, "Sorting by tags %s." % cgi.escape(repr(tags)))
        if sort_pattern:
            print_warning(req, "Sorting preferentially by %s." % cgi.escape(sort_pattern))
    ## check if we have sorting tag defined:
    if not tags:
        # good, no sort needed
        return recIDs
    # group the records by the value they are to be sorted on:
    recids_by_value = {}
    for recID in recIDs:
        vals = [] # will hold all values found in sorting tags for recID
        for tag in tags:
            vals.extend(get_fieldvalues(recID, tag))
        if sort_pattern:
            # try to pick that tag value that corresponds to sort pattern
            for v in vals:
                if v.lower().startswith(sort_pattern.lower()): # bingo!
                    val = v
                    break
            else:
                # sort_pattern not present, so add other vals after spaces
                val = sort_pattern + " " + " ".join(vals)
        else:
            # no sort pattern defined, so join them all together
            val = " ".join(vals)
        val = strip_accents(val.lower()) # sort values regardless of accents and case
        recids_by_value.setdefault(val, []).append(recID)
    # walk the values in sorted order to create the output array:
    recIDs_out = []
    for sort_value in sorted(recids_by_value.keys()):
        recIDs_out.extend(recids_by_value[sort_value])
    # ascending or descending?
    if sort_order == 'a':
        recIDs_out.reverse()
    # okay, we are done
    return recIDs_out
def print_records(req, recIDs, jrec=1, rg=10, format='hb', ot='', ln=CFG_SITE_LANG, relevances=[], relevances_prologue="(", relevances_epilogue="%%)", decompress=zlib.decompress, search_pattern='', print_records_prologue_p=True, print_records_epilogue_p=True, verbose=0, tab=''):
"""
Prints list of records 'recIDs' formatted according to 'format' in
groups of 'rg' starting from 'jrec'.
Assumes that the input list 'recIDs' is sorted in reverse order,
so it counts records from tail to head.
A value of 'rg=-9999' means to print all records: to be used with care.
Print also list of RELEVANCES for each record (if defined), in
between RELEVANCE_PROLOGUE and RELEVANCE_EPILOGUE.
Print prologue and/or epilogue specific to 'format' if
'print_records_prologue_p' and/or print_records_epilogue_p' are
True.
"""
# Parameters, as used by the code below:
#   req        - object whose write() receives all output; when it is
#                None the function returns at once without printing.
#   recIDs     - list of record IDs, indexed from irec_max down to
#                irec_min (excluded).
#   jrec, rg   - first record to show and group size; both are
#                clamped by the sanity checks below.
#   format     - output format; its value selects the branch taken:
#                'x...', 't...' or three leading digits, 'excel',
#                'hp'/'hb_...'/'hd_...', 'hb...', 'hd...', anything else.
#   ot         - passed through to print_record() and create_excel().
#   relevances, relevances_prologue, relevances_epilogue
#              - only read in the HTML brief ('hb') branch; the default
#                list is never modified here.
#   search_pattern, verbose
#              - passed through to the formatting calls.
#   tab        - selects the tab rendered in the HTML detailed ('hd') branch.
#   decompress - not referenced anywhere in this function body.
# The function has no return value; all output is written via 'req'.
# load the right message language
_ = gettext_set_language(ln)
# sanity checking:
if req is None:
return
# get user_info (for formatting based on user)
# no user information is collected when 'req' is a cStringIO output object
if isinstance(req, cStringIO.OutputType):
user_info = {}
else:
user_info = collect_user_info(req)
if len(recIDs):
nb_found = len(recIDs)
if rg == -9999: # print all records
rg = nb_found
else:
# a negative group size is treated as its absolute value
rg = abs(rg)
if jrec < 1: # sanity checks
jrec = 1
# a start position past the last hit falls back to the final group
if jrec > nb_found:
jrec = max(nb_found-rg+1, 1)
# will print records from irec_max to irec_min excluded:
irec_max = nb_found - jrec
irec_min = nb_found - jrec - rg
if irec_min < 0:
irec_min = -1
if irec_max >= nb_found:
irec_max = nb_found - 1
#req.write("%s:%d-%d" % (recIDs, irec_min, irec_max))
if format.startswith('x'):
# print header if needed
if print_records_prologue_p:
print_records_prologue(req, format)
# print records
# the whole selection is handed to format_records() in one call
recIDs_to_print = [recIDs[x] for x in range(irec_max, irec_min, -1)]
+
format_records(recIDs_to_print,
format,
ln=ln,
search_pattern=search_pattern,
record_separator="\n",
user_info=user_info,
req=req)
# print footer if needed
if print_records_epilogue_p:
print_records_epilogue(req, format)
elif format.startswith('t') or str(format[0:3]).isdigit():
# we are doing plain text output:
for irec in range(irec_max, irec_min, -1):
x = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
req.write(x)
# a newline is appended only after non-empty record output
if x:
req.write('\n')
elif format == 'excel':
recIDs_to_print = [recIDs[x] for x in range(irec_max, irec_min, -1)]
create_excel(recIDs=recIDs_to_print, req=req, ln=ln, ot=ot)
else:
# we are doing HTML output:
if format == 'hp' or format.startswith("hb_") or format.startswith("hd_"):
# portfolio and on-the-fly formats:
for irec in range(irec_max, irec_min, -1):
req.write(print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose))
elif format.startswith("hb"):
# HTML brief format:
# the add-to-basket control is switched off only when user_info is
# non-empty and its 'precached_usebaskets' entry is false
display_add_to_basket = True
if user_info and not user_info['precached_usebaskets']:
display_add_to_basket = False
req.write(websearch_templates.tmpl_record_format_htmlbrief_header(
ln = ln))
for irec in range(irec_max, irec_min, -1):
# row numbers continue from jrec as irec counts down from irec_max
row_number = jrec+irec_max-irec
recid = recIDs[irec]
if relevances and relevances[irec]:
relevance = relevances[irec]
else:
relevance = ''
record = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
req.write(websearch_templates.tmpl_record_format_htmlbrief_body(
ln = ln,
recid = recid,
row_number = row_number,
relevance = relevance,
record = record,
relevances_prologue = relevances_prologue,
relevances_epilogue = relevances_epilogue,
display_add_to_basket = display_add_to_basket
))
req.write(websearch_templates.tmpl_record_format_htmlbrief_footer(
ln = ln,
display_add_to_basket = display_add_to_basket))
elif format.startswith("hd"):
# HTML detailed format:
for irec in range(irec_max, irec_min, -1):
# tabs are looked up for the record's guessed primary collection
# and then ordered by their 'order' value
unordered_tabs = get_detailed_page_tabs(get_colID(guess_primary_collection_of_a_record(recIDs[irec])),
recIDs[irec], ln=ln)
ordered_tabs_id = [(tab_id, values['order']) for (tab_id, values) in unordered_tabs.iteritems()]
ordered_tabs_id.sort(lambda x,y: cmp(x[1],y[1]))
# the language argument is added to tab links only for a non-default language
link_ln = ''
if ln != CFG_SITE_LANG:
link_ln = '?ln=%s' % ln
# NOTE(review): the [0] below raises IndexError when the record has no
# value in CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG -- confirm this cannot happen
if CFG_WEBSEARCH_USE_ALEPH_SYSNOS:
recid_to_display = get_fieldvalues(recIDs[irec], CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG)[0]
else:
recid_to_display = recIDs[irec]
# each entry is (label, URL, is-current-tab flag, enabled flag);
# only tabs whose 'visible' value is True are kept
tabs = [(unordered_tabs[tab_id]['label'], \
'%s/record/%s/%s%s' % (CFG_SITE_URL, recid_to_display, tab_id, link_ln), \
tab_id == tab,
unordered_tabs[tab_id]['enabled']) \
for (tab_id, order) in ordered_tabs_id
if unordered_tabs[tab_id]['visible'] == True]
content = ''
# load content
if tab == 'usage':
req.write(webstyle_templates.detailed_record_container_top(recIDs[irec],
tabs,
ln))
# the result of this first call is not used: the assignment to
# downloadsimilarity is commented out and 'r' is reassigned below
r = calculate_reading_similarity_list(recIDs[irec], "downloads")
downloadsimilarity = None
downloadhistory = None
#if r:
# downloadsimilarity = r
if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS:
downloadhistory = create_download_history_graph_and_box(recIDs[irec], ln)
r = calculate_reading_similarity_list(recIDs[irec], "pageviews")
viewsimilarity = None
if r: viewsimilarity = r
content = websearch_templates.tmpl_detailed_record_statistics(recIDs[irec],
ln,
downloadsimilarity=downloadsimilarity,
downloadhistory=downloadhistory,
viewsimilarity=viewsimilarity)
req.write(content)
req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec],
tabs,
ln))
elif tab == 'citations':
recid = recIDs[irec]
req.write(webstyle_templates.detailed_record_container_top(recid,
tabs,
ln))
req.write(websearch_templates.tmpl_detailed_record_citations_prologue(recid, ln))
# Citing
citinglist = []
r = calculate_cited_by_list(recid)
if r:
citinglist = r
req.write(websearch_templates.tmpl_detailed_record_citations_citing_list(recid,
ln,
citinglist=citinglist))
# Self-cited
selfcited = get_self_cited_by(recid)
req.write(websearch_templates.tmpl_detailed_record_citations_self_cited(recid,
ln, selfcited=selfcited, citinglist=citinglist))
# Co-cited
s = calculate_co_cited_with_list(recid)
cociting = None
if s:
cociting = s
req.write(websearch_templates.tmpl_detailed_record_citations_co_citing(recid,
ln,
cociting=cociting))
# Citation history
# 'r' still holds the cited-by list computed above
citationhistory = None
if r:
citationhistory = create_citation_history_graph_and_box(recid, ln)
#debug
# NOTE(review): if this check is not nested under 'if r:', len() is applied
# to None when verbose > 3 and there are no citations -- confirm nesting
if verbose > 3:
print_warning(req, "Citation graph debug: "+str(len(citationhistory)))
req.write(websearch_templates.tmpl_detailed_record_citations_citation_history(recid, ln, citationhistory))
req.write(websearch_templates.tmpl_detailed_record_citations_epilogue(recid, ln))
req.write(webstyle_templates.detailed_record_container_bottom(recid,
tabs,
ln))
elif tab == 'references':
req.write(webstyle_templates.detailed_record_container_top(recIDs[irec],
tabs,
ln))
req.write(format_record(recIDs[irec], 'HDREF', ln=ln, user_info=user_info, verbose=verbose))
req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec],
tabs,
ln))
elif tab == 'keywords':
from bibclassify_webinterface import \
record_get_keywords, get_sorting_options, \
generate_keywords, get_keywords_body
from invenio.webinterface_handler import wash_urlargd
# the keyword display options are read from the request form
form = req.form
argd = wash_urlargd(form, {
'generate': (str, 'no'),
'sort': (str, 'occurrences'),
'type': (str, 'tagcloud'),
'numbering': (str, 'off'),
})
recid = recIDs[irec]
req.write(webstyle_templates.detailed_record_container_top(recid,
tabs, ln))
if argd['generate'] == 'yes':
# The user asked to generate the keywords.
keywords = generate_keywords(req, recid)
else:
# Get the keywords contained in the MARC.
keywords = record_get_keywords(recid, argd)
if keywords:
req.write(get_sorting_options(argd, keywords))
elif argd['sort'] == 'related' and not keywords:
req.write('You may want to run BibIndex.')
# Output the keywords or the generate button.
get_keywords_body(keywords, req, recid, argd)
req.write(webstyle_templates.detailed_record_container_bottom(recid,
tabs, ln))
else:
# Metadata tab
req.write(webstyle_templates.detailed_record_container_top(recIDs[irec],
tabs,
ln,
show_short_rec_p=False))
# the dates stay None unless record_exists() returns 1
creationdate = None
modificationdate = None
if record_exists(recIDs[irec]) == 1:
creationdate = get_creation_date(recIDs[irec])
modificationdate = get_modification_date(recIDs[irec])
content = print_record(recIDs[irec], format, ot, ln,
search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
content = websearch_templates.tmpl_detailed_record_metadata(
recID = recIDs[irec],
ln = ln,
format = format,
creationdate = creationdate,
modificationdate = modificationdate,
content = content)
req.write(content)
req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec],
tabs,
ln,
creationdate=creationdate,
modificationdate=modificationdate,
show_short_rec_p=False))
# the mini panel is written only when at least one visible tab exists
if len(tabs) > 0:
# Add the mini box at bottom of the page
if CFG_WEBCOMMENT_ALLOW_REVIEWS:
from invenio.webcomment import get_mini_reviews
reviews = get_mini_reviews(recid = recIDs[irec], ln=ln)
else:
reviews = ''
actions = format_record(recIDs[irec], 'HDACT', ln=ln, user_info=user_info, verbose=verbose)
files = format_record(recIDs[irec], 'HDFILE', ln=ln, user_info=user_info, verbose=verbose)
req.write(webstyle_templates.detailed_record_mini_panel(recIDs[irec],
ln,
format,
files=files,
reviews=reviews,
actions=actions))
else:
# Other formats
for irec in range(irec_max, irec_min, -1):
req.write(print_record(recIDs[irec], format, ot, ln,
search_pattern=search_pattern,
user_info=user_info, verbose=verbose))
else:
# an empty recIDs list only produces a warning
print_warning(req, _("Use different search terms."))
def print_records_prologue(req, format):
    """
    Write to 'req' the text that must precede a list of records
    printed in the output format 'format'.

    Only formats starting with 'x' have a prologue; for any other
    format (HTML, text) an empty string is written.
    """
    # Ordered from most to least specific: the bare 'x' prefix is the
    # fallback for every XML format not listed before it.
    templates_by_prefix = (
        ('xm', 'tmpl_xml_marc_prologue'),
        ('xn', 'tmpl_xml_nlm_prologue'),
        ('xw', 'tmpl_xml_refworks_prologue'),
        ('xr', 'tmpl_xml_rss_prologue'),
        ('xe', 'tmpl_xml_endnote_prologue'),
        ('xo', 'tmpl_xml_mods_prologue'),
        ('x', 'tmpl_xml_default_prologue'),
    )
    text = ""
    for prefix, template_name in templates_by_prefix:
        if format.startswith(prefix):
            # the template is looked up only once a prefix has matched
            text = getattr(websearch_templates, template_name)()
            break
    req.write(text)
def print_records_epilogue(req, format):
    """
    Write to 'req' the text that must follow a list of records
    printed in the output format 'format'.

    Only formats starting with 'x' have an epilogue; for any other
    format (HTML, text) an empty string is written.
    """
    # Ordered from most to least specific: the bare 'x' prefix is the
    # fallback for every XML format not listed before it.
    templates_by_prefix = (
        ('xm', 'tmpl_xml_marc_epilogue'),
        ('xn', 'tmpl_xml_nlm_epilogue'),
        ('xw', 'tmpl_xml_refworks_epilogue'),
        ('xr', 'tmpl_xml_rss_epilogue'),
        ('xe', 'tmpl_xml_endnote_epilogue'),
        ('xo', 'tmpl_xml_mods_epilogue'),
        ('x', 'tmpl_xml_default_epilogue'),
    )
    text = ""
    for prefix, template_name in templates_by_prefix:
        if format.startswith(prefix):
            # the template is looked up only once a prefix has matched
            text = getattr(websearch_templates, template_name)()
            break
    req.write(text)
def get_record(recid):
    """
    Return the record object corresponding to 'recid'.

    When CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE is set, the
    'recstruct' value stored for the record in the bibfmt table is
    tried first: it is decompressed and unmarshalled.  If there is no
    such value, or it cannot be decoded, the record is rebuilt by
    passing the output of print_record(recid, 'xm') to create_record()
    and returning the first element of its result.
    """
    from marshal import loads
    from zlib import decompress
    if CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE:
        value = run_sql('SELECT value FROM bibfmt WHERE id_bibrec=%s AND FORMAT=\'recstruct\'', (recid, ))
        if value:
            try:
                return loads(decompress(value[0][0]))
            except Exception:
                # In case of corruption, fall through and rebuild it.
                # Only ordinary errors are ignored, so that an
                # interrupt or interpreter exit is not swallowed here.
                pass
    return create_record(print_record(recid, 'xm'))[0]
def print_record(recID, format='hb', ot='', ln=CFG_SITE_LANG, decompress=zlib.decompress,
search_pattern=None, user_info=None, verbose=0):
"""Prints record 'recID' formatted accoding to 'format'."""
# Parameters, as used by the code below:
#   recID      - ID of the record to print.
#   format     - output format; 'recstruct' returns get_record(recID)
#                instead of an output string.
#   ot         - passed to get_fieldvalues_alephseq_like() in the 't'
#                branch; a non-empty value also selects the old
#                formatting procedure.
#   ln         - language passed to gettext_set_language(),
#                call_bibformat() and the template calls.
#   decompress - function applied to values read from the bibfmt table.
#   search_pattern, user_info, verbose
#              - passed through to call_bibformat().
# Returns the accumulated output string; it is empty when
# record_exists(recID) returns 0.
# NOTE(review): several string literals below carry fewer '%s'
# placeholders than arguments (e.g. the "%s\n" formats given three or
# five values), which suggests their markup was lost -- compare with the
# upstream source before relying on them.
-
if format == 'recstruct':
return get_record(recID)
_ = gettext_set_language(ln)
+ #check from user information if the user has the right to see hidden fields/tags in the
+ #records as well
# NOTE(review): user_info defaults to None and is passed on as is --
# confirm that acc_authorize_action() accepts None
+ can_see_hidden = (acc_authorize_action(user_info, 'runbibedit')[0] == 0)
+
out = ""
# sanity check:
# values tested below: 0 = does not exist, -1 = deleted, 1 = exists
record_exist_p = record_exists(recID)
if record_exist_p == 0: # doesn't exist
return out
# New Python BibFormat procedure for formatting
# Old procedure follows further below
# We must still check some special formats, but these
# should disappear when BibFormat improves.
if not (CFG_BIBFORMAT_USE_OLD_BIBFORMAT \
or format.lower().startswith('t') \
or format.lower().startswith('hm') \
or str(format[0:3]).isdigit() \
or ot):
# Unspecified format is hd
if format == '':
format = 'hd'
if record_exist_p == -1 and get_output_format_content_type(format) == 'text/html':
# HTML output displays a default value for deleted records.
# Other format have to deal with it.
out += _("The record has been deleted.")
else:
out += call_bibformat(recID, format, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
# at the end of HTML brief mode, print the "Detailed record" functionality:
if format.lower().startswith('hb') and \
format.lower() != 'hb_p':
out += websearch_templates.tmpl_print_record_brief_links(
ln = ln,
recID = recID,
)
# the new procedure ends here; nothing below runs for these formats
return out
# Old PHP BibFormat procedure for formatting
# print record opening tags, if needed:
if format == "marcxml" or format == "oai_dc":
out += " \n"
out += " \n"
for oai_id in get_fieldvalues(recID, CFG_OAI_ID_FIELD):
out += " %s\n" % oai_id
out += " %s\n" % get_modification_date(recID)
out += " \n"
out += " \n"
if format.startswith("xm") or format == "marcxml":
# look for detailed format existence:
query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s"
res = run_sql(query, (recID, format), 1)
# a stored value is used only for records that exist and are not deleted
if res and record_exist_p == 1:
# record 'recID' is formatted in 'format', so print it
out += "%s" % decompress(res[0][0])
else:
# record 'recID' is not formatted in 'format' -- they are not in "bibfmt" table; so fetch all the data from "bibXXx" tables:
if format == "marcxml":
out += """ \n"""
out += " %d\n" % int(recID)
elif format.startswith("xm"):
out += """ \n"""
out += " %d\n" % int(recID)
if record_exist_p == -1:
# deleted record, so display only OAI ID and 980:
oai_ids = get_fieldvalues(recID, CFG_OAI_ID_FIELD)
if oai_ids:
out += "%s\n" % \
(CFG_OAI_ID_FIELD[0:3], CFG_OAI_ID_FIELD[3:4], CFG_OAI_ID_FIELD[4:5], CFG_OAI_ID_FIELD[5:6], oai_ids[0])
out += "DELETED\n"
else:
# controlfields
query = "SELECT b.tag,b.value,bb.field_number FROM bib00x AS b, bibrec_bib00x AS bb "\
"WHERE bb.id_bibrec=%s AND b.id=bb.id_bibxxx AND b.tag LIKE '00%%' "\
"ORDER BY bb.field_number, b.tag ASC"
res = run_sql(query, (recID, ))
for row in res:
field, value = row[0], row[1]
value = encode_for_xml(value)
out += """ %s\n""" % \
(encode_for_xml(field[0:3]), value)
# datafields
i = 1 # Do not process bib00x and bibrec_bib00x, as
# they are controlfields. So start at bib01x and
# bibrec_bib00x (and set i = 0 at the end of
# first loop)
for digit1 in range(0, 10):
for digit2 in range(i, 10):
# table names are built from the two leading tag digits
bx = "bib%d%dx" % (digit1, digit2)
bibx = "bibrec_bib%d%dx" % (digit1, digit2)
query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\
"WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s"\
"ORDER BY bb.field_number, b.tag ASC" % (bx, bibx)
res = run_sql(query, (recID, str(digit1)+str(digit2)+'%'))
field_number_old = -999
field_old = ""
for row in res:
field, value, field_number = row[0], row[1], row[2]
# indicators given as "_" or empty are replaced by a blank
ind1, ind2 = field[3], field[4]
if ind1 == "_" or ind1 == "":
ind1 = " "
if ind2 == "_" or ind2 == "":
ind2 = " "
- # print field tag
- if field_number != field_number_old or field[:-1] != field_old[:-1]:
- if field_number_old != -999:
- out += """ \n"""
- out += """ \n""" % \
- (encode_for_xml(field[0:3]), encode_for_xml(ind1), encode_for_xml(ind2))
- field_number_old = field_number
- field_old = field
- # print subfield value
- value = encode_for_xml(value)
- out += """ %s\n""" % \
+ # print field tag, unless hidden
# a field is hidden when it begins with any entry of
# CFG_BIBFORMAT_HIDDEN_TAGS (prefix match) and the user lacks the right
+ printme = True
+ if not can_see_hidden:
+ for htag in CFG_BIBFORMAT_HIDDEN_TAGS:
+ ltag = len(htag)
+ samelenfield = field[0:ltag]
+ if samelenfield == htag:
+ printme = False
+
+ if printme:
# a new field tag is opened when the field number or the tag with
# indicators (everything but the subfield code) changes
+ if field_number != field_number_old or field[:-1] != field_old[:-1]:
+ if field_number_old != -999:
+ out += """ \n"""
+ out += """ \n""" % \
+ (encode_for_xml(field[0:3]), encode_for_xml(ind1), encode_for_xml(ind2))
+ field_number_old = field_number
+ field_old = field
+ # print subfield value
+ value = encode_for_xml(value)
+ out += """ %s\n""" % \
(encode_for_xml(field[-1:]), value)
# all fields/subfields printed in this run, so close the tag:
if field_number_old != -999:
out += """ \n"""
i = 0 # Next loop should start looking at bib%0 and bibrec_bib00x
# we are at the end of printing the record:
out += " \n"
elif format == "xd" or format == "oai_dc":
# XML Dublin Core format, possibly OAI -- select only some bibXXx fields:
out += """ \n"""
if record_exist_p == -1:
out += ""
else:
# NOTE(review): unlike the values below, 041__a is not passed
# through encode_for_xml() -- confirm this is intended
for f in get_fieldvalues(recID, "041__a"):
out += " %s\n" % f
for f in get_fieldvalues(recID, "100__a"):
out += " %s\n" % encode_for_xml(f)
for f in get_fieldvalues(recID, "700__a"):
out += " %s\n" % encode_for_xml(f)
for f in get_fieldvalues(recID, "245__a"):
out += " %s\n" % encode_for_xml(f)
for f in get_fieldvalues(recID, "65017a"):
out += " %s\n" % encode_for_xml(f)
for f in get_fieldvalues(recID, "8564_u"):
out += " %s\n" % encode_for_xml(f)
for f in get_fieldvalues(recID, "520__a"):
out += " %s\n" % encode_for_xml(f)
out += " %s\n" % get_creation_date(recID)
out += " \n"
elif len(format) == 6 and str(format[0:3]).isdigit():
# user has asked to print some fields only
# NOTE(review): 'format' has length 6 in this branch, so the
# comparison with "001" below can never be true -- confirm
if format == "001":
out += "%s\n" % (format, recID, format)
else:
vals = get_fieldvalues(recID, format)
for val in vals:
out += "%s\n" % (format, val, format)
elif format.startswith('t'):
## user directly asked for some tags to be displayed only
# deleted records show a fixed tag list instead of 'ot'
if record_exist_p == -1:
- out += get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"])
+ out += get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"], can_see_hidden)
else:
- out += get_fieldvalues_alephseq_like(recID, ot)
+ out += get_fieldvalues_alephseq_like(recID, ot, can_see_hidden)
elif format == "hm":
if record_exist_p == -1:
# NOTE(review): the next two lines are not a complete statement in
# this view; the rest of the 'hm' branch is not visible here
- out += "\n
"
elif format == "hd":
# HTML detailed format
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
# look for detailed format existence:
query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s"
res = run_sql(query, (recID, format), 1)
if res:
# record 'recID' is formatted in 'format', so print it
out += "%s" % decompress(res[0][0])
else:
# record 'recID' is not formatted in 'format', so try to call BibFormat on the fly or use default format:
out_record_in_format = call_bibformat(recID, format, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
if out_record_in_format:
out += out_record_in_format
else:
out += websearch_templates.tmpl_print_record_detailed(
ln = ln,
recID = recID,
)
elif format.startswith("hb_") or format.startswith("hd_"):
# underscore means that HTML brief/detailed formats should be called on-the-fly; suitable for testing formats
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
out += call_bibformat(recID, format, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
elif format.startswith("hx"):
# BibTeX format, called on the fly:
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
out += call_bibformat(recID, format, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
elif format.startswith("hs"):
# for citation/download similarity navigation links:
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
out += '' % websearch_templates.build_search_url(recid=recID, ln=ln)
# firstly, title:
titles = get_fieldvalues(recID, "245__a")
if titles:
for title in titles:
out += "%s" % title
else:
# usual title not found, try conference title:
titles = get_fieldvalues(recID, "111__a")
if titles:
for title in titles:
out += "%s" % title
else:
# just print record ID:
out += "%s %d" % (get_field_i18nname("record ID", ln, False), recID)
out += ""
# secondly, authors:
# only the first author is shown, followed by "et al" if there are more
authors = get_fieldvalues(recID, "100__a") + get_fieldvalues(recID, "700__a")
if authors:
out += " - %s" % authors[0]
if len(authors) > 1:
out += " et al"
# thirdly publication info:
# tags are tried in turn until one yields a value
publinfos = get_fieldvalues(recID, "773__s")
if not publinfos:
publinfos = get_fieldvalues(recID, "909C4s")
if not publinfos:
publinfos = get_fieldvalues(recID, "037__a")
if not publinfos:
publinfos = get_fieldvalues(recID, "088__a")
if publinfos:
out += " - %s" % publinfos[0]
else:
# fourthly publication year (if not publication info):
years = get_fieldvalues(recID, "773__y")
if not years:
years = get_fieldvalues(recID, "909C4y")
if not years:
years = get_fieldvalues(recID, "260__c")
if years:
out += " (%s)" % years[0]
else:
# HTML brief format by default
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s"
res = run_sql(query, (recID, format))
if res:
# record 'recID' is formatted in 'format', so print it
out += "%s" % decompress(res[0][0])
else:
# record 'recID' is not formatted in 'format', so try to call BibFormat on the fly: or use default format:
if CFG_WEBSEARCH_CALL_BIBFORMAT:
out_record_in_format = call_bibformat(recID, format, ln, search_pattern=search_pattern,
user_info=user_info, verbose=verbose)
if out_record_in_format:
out += out_record_in_format
else:
out += websearch_templates.tmpl_print_record_brief(
ln = ln,
recID = recID,
)
else:
out += websearch_templates.tmpl_print_record_brief(
ln = ln,
recID = recID,
)
# at the end of HTML brief mode, print the "Detailed record" functionality:
if format == 'hp' or format.startswith("hb_") or format.startswith("hd_"):
pass # do nothing for portfolio and on-the-fly formats
else:
out += websearch_templates.tmpl_print_record_brief_links(
ln = ln,
recID = recID,
)
# print record closing tags, if needed:
if format == "marcxml" or format == "oai_dc":
out += " \n"
out += " \n"
return out
def call_bibformat(recID, format="HD", ln=CFG_SITE_LANG, search_pattern=None, user_info=None, verbose=0):
    """
    Format record 'recID' through BibFormat and return the result.
    BibFormat decides by itself whether the old or the new BibFormat
    is to be used.

    If 'search_pattern' is given, it is split into basic search units
    and the second element of every unit whose first element is not
    '-' is passed on as a keyword; otherwise the keyword list is empty.
    """
    if search_pattern is None:
        wanted_terms = []
    else:
        wanted_terms = []
        for search_unit in create_basic_search_units(None, str(search_pattern), None):
            # units flagged with '-' are left out of the keyword list
            if search_unit[0] != '-':
                wanted_terms.append(search_unit[1])
    formatted = format_record(recID,
                              of=format,
                              ln=ln,
                              search_pattern=wanted_terms,
                              user_info=user_info,
                              verbose=verbose)
    return formatted
def log_query(hostname, query_args, uid=-1):
    """
    Log query into the query and user_query tables.
    Return id_query or None in case of problems.

    Nothing is logged, and None is returned, when 'uid' is negative.
    Otherwise the query table is searched for a row whose urlargs
    equal 'query_args'; if none is found a new row is inserted.  When
    an id is available, a user_query row recording 'uid', the query
    id, 'hostname' and the current local time is inserted as well.
    """
    id_query = None
    if uid >= 0:
        # log the query only if uid is reasonable
        res = run_sql("SELECT id FROM query WHERE urlargs=%s", (query_args,), 1)
        if res:
            # this query is already known: reuse its id
            id_query = res[0][0]
        else:
            # unknown query: register it; the value returned by
            # run_sql() for the INSERT is used as the query id
            id_query = run_sql("INSERT INTO query (type, urlargs) VALUES ('r', %s)", (query_args,))
        if id_query:
            run_sql("INSERT INTO user_query (id_user, id_query, hostname, date) VALUES (%s, %s, %s, %s)",
                    (uid, id_query, hostname,
                     time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return id_query
def log_query_info(action, p, f, colls, nb_records_found_total=-1):
    """
    Write some info to the log file for later analysis.

    One line is appended to CFG_LOGDIR + "/search.log", made of
    '#'-separated parts: local timestamp (YYYYmmddHHMMSS), 'action',
    'p', 'f', the comma-separated items of 'colls' and
    'nb_records_found_total'.

    Logging is best effort: any ordinary error while building or
    writing the line is ignored.  The line is built completely before
    the file is opened and is written in one call, so a failure does
    not leave a partial entry behind; an empty 'colls' gives an empty
    collection part.  Always returns None.
    """
    try:
        entry = time.strftime("%Y%m%d%H%M%S#", time.localtime()) \
                + action + "#" \
                + p + "#" \
                + f + "#" \
                + ",".join(["%s" % coll for coll in colls]) + "#" \
                + "%d" % nb_records_found_total \
                + "\n"
        log = open(CFG_LOGDIR + "/search.log", "a")
        try:
            log.write(entry)
        finally:
            # release the file handle even if the write fails
            log.close()
    except Exception:
        # a logging problem must never break the search itself
        pass
    return
### CALLABLES
def perform_request_search(req=None, cc=CFG_SITE_NAME, c=None, p="", f="", rg=10, sf="", so="d", sp="", rm="", of="id", ot="", aas=0,
p1="", f1="", m1="", op1="", p2="", f2="", m2="", op2="", p3="", f3="", m3="", sc=0, jrec=0,
recid=-1, recidb=-1, sysno="", id=-1, idb=-1, sysnb="", action="", d1="",
d1y=0, d1m=0, d1d=0, d2="", d2y=0, d2m=0, d2d=0, dt="", verbose=0, ap=0, ln=CFG_SITE_LANG, ec=None, tab=""):
"""Perform search or browse request, without checking for
authentication. Return list of recIDs found, if of=id.
Otherwise create web page.
The arguments are as follows:
req - mod_python Request class instance.
cc - current collection (e.g. "ATLAS"). The collection the
user started to search/browse from.
c - collection list (e.g. ["Theses", "Books"]). The
collections user may have selected/deselected when
starting to search from 'cc'.
p - pattern to search for (e.g. "ellis and muon or kaon").
f - field to search within (e.g. "author").
rg - records in groups of (e.g. "10"). Defines how many hits
per collection in the search results page are
displayed.
sf - sort field (e.g. "title").
so - sort order ("a"=ascending, "d"=descending).
sp - sort pattern (e.g. "CERN-") -- in case there are more
values in a sort field, this argument tells which one
to prefer
rm - ranking method (e.g. "jif"). Defines whether results
should be ranked by some known ranking method.
of - output format (e.g. "hb"). Usually starting "h" means
HTML output (and "hb" for HTML brief, "hd" for HTML
detailed), "x" means XML output, "t" means plain text
output, "id" means no output at all but to return list
of recIDs found. (Suitable for high-level API.)
ot - output only these MARC tags (e.g. "100,700,909C0b").
Useful if only some fields are to be shown in the
output, e.g. for library to control some fields.
aas - advanced search ("0" means no, "1" means yes). Whether
search was called from within the advanced search
interface.
p1 - first pattern to search for in the advanced search
interface. Much like 'p'.
f1 - first field to search within in the advanced search
interface. Much like 'f'.
m1 - first matching type in the advanced search interface.
("a" all of the words, "o" any of the words, "e" exact
phrase, "p" partial phrase, "r" regular expression).
op1 - first operator, to join the first and the second unit
in the advanced search interface. ("a" add, "o" or,
"n" not).
p2 - second pattern to search for in the advanced search
interface. Much like 'p'.
f2 - second field to search within in the advanced search
interface. Much like 'f'.
m2 - second matching type in the advanced search interface.
("a" all of the words, "o" any of the words, "e" exact
phrase, "p" partial phrase, "r" regular expression).
op2 - second operator, to join the second and the third unit
in the advanced search interface. ("a" add, "o" or,
"n" not).
p3 - third pattern to search for in the advanced search
interface. Much like 'p'.
f3 - third field to search within in the advanced search
interface. Much like 'f'.
m3 - third matching type in the advanced search interface.
("a" all of the words, "o" any of the words, "e" exact
phrase, "p" partial phrase, "r" regular expression).
sc - split by collection ("0" no, "1" yes). Governs whether
we want to present the results in a single huge list,
or splitted by collection.
jrec - jump to record (e.g. "234"). Used for navigation
inside the search results.
recid - display record ID (e.g. "20000"). Do not
search/browse but go straight away to the Detailed
record page for the given recID.
recidb - display record ID bis (e.g. "20010"). If greater than
'recid', then display records from recid to recidb.
Useful for example for dumping records from the
database for reformatting.
sysno - display old system SYS number (e.g. ""). If you
migrate to CDS Invenio from another system, and store your
old SYS call numbers, you can use them instead of recid
if you wish so.
id - the same as recid, in case recid is not set. For
backwards compatibility.
idb - the same as recid, in case recidb is not set. For
backwards compatibility.
sysnb - the same as sysno, in case sysno is not set. For
backwards compatibility.
action - action to do. "SEARCH" for searching, "Browse" for
browsing. Default is to search.
d1 - first datetime in full YYYY-mm-dd HH:MM:DD format
(e.g. "1998-08-23 12:34:56"). Useful for search limits
on creation/modification date (see 'dt' argument
below). Note that 'd1' takes precedence over d1y, d1m,
d1d if these are defined.
d1y - first date's year (e.g. "1998"). Useful for search
limits on creation/modification date.
d1m - first date's month (e.g. "08"). Useful for search
limits on creation/modification date.
d1d - first date's day (e.g. "23"). Useful for search
limits on creation/modification date.
d2 - second datetime in full YYYY-mm-dd HH:MM:DD format
(e.g. "1998-09-02 12:34:56"). Useful for search limits
on creation/modification date (see 'dt' argument
below). Note that 'd2' takes precedence over d2y, d2m,
d2d if these are defined.
d2y - second date's year (e.g. "1998"). Useful for search
limits on creation/modification date.
d2m - second date's month (e.g. "09"). Useful for search
limits on creation/modification date.
d2d - second date's day (e.g. "02"). Useful for search
limits on creation/modification date.
dt - first and second date's type (e.g. "c"). Specifies
whether to search in creation dates ("c") or in
modification dates ("m"). When dt is not set and d1*
and d2* are set, the default is "c".
verbose - verbose level (0=min, 9=max). Useful to print some
internal information on the searching process in case
something goes wrong.
ap - alternative patterns (0=no, 1=yes). In case no exact
match is found, the search engine can try alternative
patterns e.g. to replace non-alphanumeric characters by
a boolean query. ap defines if this is wanted.
ln - language of the search interface (e.g. "en"). Useful
for internationalization.
ec - list of external search engines to search as well
(e.g. "SPIRES HEP").
"""
selected_external_collections_infos = None
# wash output format:
of = wash_output_format(of)
- # raise an exception when trying to print out html or xml from the cli
- if of.startswith("h") or of.startswith("x"):
+ # raise an exception when trying to print out html from the cli
+ if of.startswith("h"):
assert req
# for every search engine request asking for an HTML output, we
# first regenerate cache of collection and field I18N names if
# needed; so that later we won't bother checking timestamps for
# I18N names at all:
if of.startswith("h"):
collection_i18nname_cache.recreate_cache_if_needed()
field_i18nname_cache.recreate_cache_if_needed()
# wash all arguments requiring special care
try:
(cc, colls_to_display, colls_to_search, hosted_colls, wash_colls_debug) = wash_colls(cc, c, sc, verbose) # which colls to search and to display?
except InvenioWebSearchUnknownCollectionError, exc:
colname = exc.colname
if of.startswith("h"):
page_start(req, of, cc, aas, ln, getUid(req),
websearch_templates.tmpl_collection_not_found_page_title(colname, ln))
req.write(websearch_templates.tmpl_collection_not_found_page_body(colname, ln))
return page_end(req, of, ln)
elif of == "id":
return []
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
else:
return page_end(req, of, ln)
p = wash_pattern(p)
f = wash_field(f)
p1 = wash_pattern(p1)
f1 = wash_field(f1)
p2 = wash_pattern(p2)
f2 = wash_field(f2)
p3 = wash_pattern(p3)
f3 = wash_field(f3)
datetext1, datetext2 = wash_dates(d1, d1y, d1m, d1d, d2, d2y, d2m, d2d)
# wash ranking method:
if not is_method_valid(None, rm):
rm = ""
_ = gettext_set_language(ln)
# backwards compatibility: id, idb, sysnb -> recid, recidb, sysno (if applicable)
if sysnb != "" and sysno == "":
sysno = sysnb
if id > 0 and recid == -1:
recid = id
if idb > 0 and recidb == -1:
recidb = idb
# TODO deduce passed search limiting criterias (if applicable)
pl, pl_in_url = "", "" # no limits by default
if action != "browse" and req and not isinstance(req, cStringIO.OutputType) \
and req.args: # we do not want to add options while browsing or while calling via command-line
fieldargs = cgi.parse_qs(req.args)
for fieldcode in get_fieldcodes():
if fieldargs.has_key(fieldcode):
for val in fieldargs[fieldcode]:
pl += "+%s:\"%s\" " % (fieldcode, val)
pl_in_url += "&%s=%s" % (urllib.quote(fieldcode), urllib.quote(val))
# deduce recid from sysno argument (if applicable):
if sysno: # ALEPH SYS number was passed, so deduce DB recID for the record:
recid = get_mysql_recid_from_aleph_sysno(sysno)
if recid is None:
recid = 0 # use recid 0 to indicate that this sysno does not exist
# deduce collection we are in (if applicable):
if recid > 0:
referer = None
if req:
referer = req.headers_in.get('Referer')
cc = guess_collection_of_a_record(recid, referer)
# deduce user id (if applicable):
try:
uid = getUid(req)
except:
uid = 0
## 0 - start output
if recid >= 0: # recid can be 0 if deduced from sysno and if such sysno does not exist
## 1 - detailed record display
title, description, keywords = \
websearch_templates.tmpl_record_page_header_content(req, recid, ln)
if req is not None and not req.header_only:
page_start(req, of, cc, aas, ln, uid, title, description, keywords, recid, tab)
# Default format is hb but we are in detailed -> change 'of'
if of == "hb":
of = "hd"
if record_exists(recid):
if recidb <= recid: # sanity check
recidb = recid + 1
if of == "id":
return [recidx for recidx in range(recid, recidb) if record_exists(recidx)]
else:
print_records(req, range(recid, recidb), -1, -9999, of, ot, ln, search_pattern=p, verbose=verbose, tab=tab)
if req and of.startswith("h"): # register detailed record page view event
client_ip_address = str(req.remote_ip)
register_page_view_event(recid, uid, client_ip_address)
else: # record does not exist
if of == "id":
return []
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
elif of.startswith("h"):
if req.header_only:
raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND
else:
print_warning(req, _("Requested record does not seem to exist."))
elif action == "browse":
## 2 - browse needed
of = 'hb'
page_start(req, of, cc, aas, ln, uid, _("Browse"), p=create_page_title_search_pattern_info(p, p1, p2, p3))
req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1,
p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action))
try:
if aas == 1 or (p1 or p2 or p3):
browse_pattern(req, colls_to_search, p1, f1, rg, ln)
browse_pattern(req, colls_to_search, p2, f2, rg, ln)
browse_pattern(req, colls_to_search, p3, f3, rg, ln)
else:
browse_pattern(req, colls_to_search, p, f, rg, ln)
except:
register_exception(req=req, alert_admin=True)
if of.startswith("h"):
req.write(create_error_box(req, verbose=verbose, ln=ln))
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
elif rm and p.startswith("recid:"):
## 3-ter - similarity search or citation search needed
if req and not req.header_only:
page_start(req, of, cc, aas, ln, uid, _("Search Results"), p=create_page_title_search_pattern_info(p, p1, p2, p3))
if of.startswith("h"):
req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1,
p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action))
if record_exists(p[6:]) != 1:
# record does not exist
if of.startswith("h"):
if req.header_only:
raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND
else:
print_warning(req, "Requested record does not seem to exist.")
if of == "id":
return []
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
else:
# record well exists, so find similar ones to it
t1 = os.times()[4]
results_similar_recIDs, results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, results_similar_comments = \
rank_records(rm, 0, get_collection_reclist(cc), string.split(p), verbose)
if results_similar_recIDs:
t2 = os.times()[4]
cpu_time = t2 - t1
if of.startswith("h"):
req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, cc, len(results_similar_recIDs),
jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time))
print_warning(req, results_similar_comments)
print_records(req, results_similar_recIDs, jrec, rg, of, ot, ln,
results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, search_pattern=p, verbose=verbose)
elif of=="id":
return results_similar_recIDs
elif of.startswith("x"):
print_records(req, results_similar_recIDs, jrec, rg, of, ot, ln,
results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, search_pattern=p, verbose=verbose)
else:
# rank_records failed and returned some error message to display:
if of.startswith("h"):
print_warning(req, results_similar_relevances_prologue)
print_warning(req, results_similar_relevances_epilogue)
print_warning(req, results_similar_comments)
if of == "id":
return []
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
elif p.startswith("cocitedwith:"): #WAS EXPERIMENTAL
## 3-terter - cited by search needed
page_start(req, of, cc, aas, ln, uid, _("Search Results"), p=create_page_title_search_pattern_info(p, p1, p2, p3))
if of.startswith("h"):
req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1,
p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action))
recID = p[12:]
if record_exists(recID) != 1:
# record does not exist
if of.startswith("h"):
print_warning(req, "Requested record does not seem to exist.")
if of == "id":
return []
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
else:
# record well exists, so find co-cited ones:
t1 = os.times()[4]
results_cocited_recIDs = map(lambda x: x[0], calculate_co_cited_with_list(int(recID)))
if results_cocited_recIDs:
t2 = os.times()[4]
cpu_time = t2 - t1
if of.startswith("h"):
req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, CFG_SITE_NAME, len(results_cocited_recIDs),
jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time))
print_records(req, results_cocited_recIDs, jrec, rg, of, ot, ln, search_pattern=p, verbose=verbose)
elif of=="id":
return results_cocited_recIDs
elif of.startswith("x"):
print_records(req, results_cocited_recIDs, jrec, rg, of, ot, ln, search_pattern=p, verbose=verbose)
else:
# cited rank_records failed and returned some error message to display:
if of.startswith("h"):
print_warning(req, "nothing found")
if of == "id":
return []
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
else:
## 3 - common search needed
query_in_cache = False
query_representation_in_cache = repr((p,f,colls_to_search))
page_start(req, of, cc, aas, ln, uid, p=create_page_title_search_pattern_info(p, p1, p2, p3))
if of.startswith("h") and verbose and wash_colls_debug:
print_warning(req, "wash_colls debugging info : %s" % wash_colls_debug)
# search into the hosted collections only if the output format is html or xml
if hosted_colls and (of.startswith("h") or of.startswith("x")) and not p.startswith("recid:"):
# hosted_colls_results : the hosted collections' searches that did not timeout
# hosted_colls_timeouts : the hosted collections' searches that timed out and will be searched later on again
(hosted_colls_results, hosted_colls_timeouts) = calculate_hosted_collections_results(req, [p, p1, p2, p3], f, hosted_colls, verbose, ln, CFG_HOSTED_COLLECTION_TIMEOUT_ANTE_SEARCH)
# successful searches
if hosted_colls_results:
hosted_colls_true_results = []
for result in hosted_colls_results:
# if the number of results is None or 0 (or False) then just do nothing
if result[1] == None or result[1] == False:
# these are the searches that returned no or zero results
if verbose:
print_warning(req, "Hosted collections (perform_search_request): %s returned no results" % result[0][1].name)
else:
# these are the searches that actually returned results on time
hosted_colls_true_results.append(result)
if verbose:
print_warning(req, "Hosted collections (perform_search_request): %s returned %s results in %s seconds" % (result[0][1].name, result[1], result[2]))
else:
if verbose:
print_warning(req, "Hosted collections (perform_search_request): there were no hosted collections results to be printed at this time")
if hosted_colls_timeouts:
if verbose:
for timeout in hosted_colls_timeouts:
print_warning(req, "Hosted collections (perform_search_request): %s timed out and will be searched again later" % timeout[0][1].name)
# we need to know for later use if there were any hosted collections to be searched even if they weren't in the end
elif hosted_colls and ((not (of.startswith("h") or of.startswith("x"))) or p.startswith("recid:")):
(hosted_colls_results, hosted_colls_timeouts) = (None, None)
else:
if verbose:
print_warning(req, "Hosted collections (perform_search_request): there were no hosted collections to be searched")
## let's define some useful boolean variables:
# True means there are actual or potential hosted collections results to be printed
hosted_colls_actual_or_potential_results_p = not (not hosted_colls or not ((hosted_colls_results and hosted_colls_true_results) or hosted_colls_timeouts))
# True means there are hosted collections timeouts to take care of later
# (useful for more accurate printing of results later)
hosted_colls_potential_results_p = not (not hosted_colls or not hosted_colls_timeouts)
# True means we only have hosted collections to deal with
only_hosted_colls_actual_or_potential_results_p = not colls_to_search and hosted_colls_actual_or_potential_results_p
if of.startswith("h"):
req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1,
p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action))
t1 = os.times()[4]
results_in_any_collection = HitSet()
if aas == 1 or (p1 or p2 or p3):
## 3A - advanced search
try:
results_in_any_collection = search_pattern_parenthesised(req, p1, f1, m1, ap=ap, of=of, verbose=verbose, ln=ln)
if len(results_in_any_collection) == 0:
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
if p2:
results_tmp = search_pattern_parenthesised(req, p2, f2, m2, ap=ap, of=of, verbose=verbose, ln=ln)
if op1 == "a": # add
results_in_any_collection.intersection_update(results_tmp)
elif op1 == "o": # or
results_in_any_collection.union_update(results_tmp)
elif op1 == "n": # not
results_in_any_collection.difference_update(results_tmp)
else:
if of.startswith("h"):
print_warning(req, "Invalid set operation %s." % cgi.escape(op1), "Error")
if len(results_in_any_collection) == 0:
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
if p3:
results_tmp = search_pattern_parenthesised(req, p3, f3, m3, ap=ap, of=of, verbose=verbose, ln=ln)
if op2 == "a": # add
results_in_any_collection.intersection_update(results_tmp)
elif op2 == "o": # or
results_in_any_collection.union_update(results_tmp)
elif op2 == "n": # not
results_in_any_collection.difference_update(results_tmp)
else:
if of.startswith("h"):
print_warning(req, "Invalid set operation %s." % cgi.escape(op2), "Error")
except:
register_exception(req=req, alert_admin=True)
if of.startswith("h"):
req.write(create_error_box(req, verbose=verbose, ln=ln))
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
else:
## 3B - simple search
if search_results_cache.cache.has_key(query_representation_in_cache):
# query is in the cache already, so reuse it:
query_in_cache = True
results_in_any_collection = search_results_cache.cache[query_representation_in_cache]
if verbose and of.startswith("h"):
print_warning(req, "Search stage 0: query found in cache, reusing cached results.")
else:
try:
# added the display_nearest_terms_box parameter to avoid printing out the "Nearest terms in any collection"
# recommendations when there are results only in the hosted collections. Also added the if clause to avoid
# searching in case we know we only have actual or potential hosted collections results
if not only_hosted_colls_actual_or_potential_results_p:
results_in_any_collection = search_pattern_parenthesised(req, p, f, ap=ap, of=of, verbose=verbose, ln=ln, display_nearest_terms_box=not hosted_colls_actual_or_potential_results_p)
except:
register_exception(req=req, alert_admin=True)
if of.startswith("h"):
req.write(create_error_box(req, verbose=verbose, ln=ln))
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
return page_end(req, of, ln)
if len(results_in_any_collection) == 0 and not hosted_colls_actual_or_potential_results_p:
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
# store this search query results into search results cache if needed:
if CFG_WEBSEARCH_SEARCH_CACHE_SIZE and not query_in_cache:
if len(search_results_cache.cache) > CFG_WEBSEARCH_SEARCH_CACHE_SIZE:
search_results_cache.clear()
search_results_cache.cache[query_representation_in_cache] = results_in_any_collection
if verbose and of.startswith("h"):
print_warning(req, "Search stage 3: storing query results in cache.")
# search stage 4: intersection with collection universe:
try:
# added the display_nearest_terms_box parameter to avoid printing out the "Nearest terms in any collection"
# recommendations when there are results only in the hosted collections. Also added the if clause to avoid
# searching in case we know since the last stage that we have no results in any collection
if len(results_in_any_collection) != 0:
results_final = intersect_results_with_collrecs(req, results_in_any_collection, colls_to_search, ap, of, verbose, ln, display_nearest_terms_box=not hosted_colls_actual_or_potential_results_p)
else:
results_final = {}
except:
register_exception(req=req, alert_admin=True)
if of.startswith("h"):
req.write(create_error_box(req, verbose=verbose, ln=ln))
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
return page_end(req, of, ln)
if results_final == {} and not hosted_colls_actual_or_potential_results_p:
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
if of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
# search stage 5: apply search option limits and restrictions:
if datetext1 != "" and results_final != {}:
if verbose and of.startswith("h"):
print_warning(req, "Search stage 5: applying time etc limits, from %s until %s..." % (datetext1, datetext2))
try:
results_final = intersect_results_with_hitset(req,
results_final,
search_unit_in_bibrec(datetext1, datetext2, dt),
ap,
aptext= _("No match within your time limits, "
"discarding this condition..."),
of=of)
except:
register_exception(req=req, alert_admin=True)
if of.startswith("h"):
req.write(create_error_box(req, verbose=verbose, ln=ln))
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
return page_end(req, of, ln)
if results_final == {} and not hosted_colls_actual_or_potential_results_p:
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
#if of.startswith("x"):
# # Print empty, but valid XML
# print_records_prologue(req, of)
# print_records_epilogue(req, of)
return page_end(req, of, ln)
if pl and results_final != {}:
pl = wash_pattern(pl)
if verbose and of.startswith("h"):
print_warning(req, "Search stage 5: applying search pattern limit %s..." % cgi.escape(pl))
try:
results_final = intersect_results_with_hitset(req,
results_final,
search_pattern_parenthesised(req, pl, ap=0, ln=ln),
ap,
aptext=_("No match within your search limits, "
"discarding this condition..."),
of=of)
except:
register_exception(req=req, alert_admin=True)
if of.startswith("h"):
req.write(create_error_box(req, verbose=verbose, ln=ln))
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
return page_end(req, of, ln)
if results_final == {} and not hosted_colls_actual_or_potential_results_p:
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
if of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
t2 = os.times()[4]
cpu_time = t2 - t1
## search stage 6: display results:
results_final_nb_total = 0
results_final_nb = {} # will hold number of records found in each collection
# (in simple dict to display overview more easily)
for coll in results_final.keys():
results_final_nb[coll] = len(results_final[coll])
#results_final_nb_total += results_final_nb[coll]
# Now let us calculate results_final_nb_total more precisely,
# in order to get the total number of "distinct" hits across
# searched collections; this is useful because a record might
# have been attributed to more than one primary collection; so
# we have to avoid counting it multiple times. The price to
# pay for this accuracy of results_final_nb_total is somewhat
# increased CPU time.
if results_final.keys() == 1:
# only one collection; no need to union them
results_final_for_all_selected_colls = results_final.values()[0]
results_final_nb_total = results_final_nb.values()[0]
else:
# okay, some work ahead to union hits across collections:
results_final_for_all_selected_colls = HitSet()
for coll in results_final.keys():
results_final_for_all_selected_colls.union_update(results_final[coll])
results_final_nb_total = len(results_final_for_all_selected_colls)
#if hosted_colls and (of.startswith("h") or of.startswith("x")):
if hosted_colls_actual_or_potential_results_p:
if hosted_colls_results:
for result in hosted_colls_true_results:
colls_to_search.append(result[0][1].name)
results_final_nb[result[0][1].name] = result[1]
results_final_nb_total += result[1]
cpu_time += result[2]
if hosted_colls_timeouts:
for timeout in hosted_colls_timeouts:
colls_to_search.append(timeout[1].name)
# use -963 as a special number to identify the collections that timed out
results_final_nb[timeout[1].name] = -963
# we continue past this point only if there is a hosted collection that has timed out and might offer potential results
if results_final_nb_total ==0 and not hosted_colls_potential_results_p:
if of.startswith("h"):
print_warning(req, "No match found, please enter different search terms.")
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
else:
# yes, some hits found: good!
# collection list may have changed due to not-exact-match-found policy so check it out:
for coll in results_final.keys():
if coll not in colls_to_search:
colls_to_search.append(coll)
# print results overview:
if of == "id":
# we have been asked to return list of recIDs
recIDs = list(results_final_for_all_selected_colls)
if sf: # do we have to sort?
recIDs = sort_records(req, recIDs, sf, so, sp, verbose, of)
elif rm: # do we have to rank?
results_final_for_all_colls_rank_records_output = rank_records(rm, 0, results_final_for_all_selected_colls,
string.split(p) + string.split(p1) +
string.split(p2) + string.split(p3), verbose)
if results_final_for_all_colls_rank_records_output[0]:
recIDs = results_final_for_all_colls_rank_records_output[0]
return recIDs
elif of.startswith("h"):
if of not in ['hcs']:
# added the hosted_colls_potential_results_p parameter to help print out the overview more accurately
req.write(print_results_overview(req, colls_to_search, results_final_nb_total, results_final_nb, cpu_time, ln, ec, hosted_colls_potential_results_p=hosted_colls_potential_results_p))
selected_external_collections_infos = print_external_results_overview(req, cc, [p, p1, p2, p3], f, ec, verbose, ln)
# print number of hits found for XML outputs:
if of.startswith("x"):
req.write("\n" % results_final_nb_total)
# print records:
if of in ['hcs']:
# feed the current search to be summarized:
from invenio.search_engine_summarizer import summarize_records
summarize_records(results_final_for_all_selected_colls, 'hcs', ln, p, f, req)
else:
if len(colls_to_search)>1:
cpu_time = -1 # we do not want to have search time printed on each collection
print_records_prologue(req, of)
for coll in colls_to_search:
if results_final.has_key(coll) and len(results_final[coll]):
if of.startswith("h"):
req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, coll, results_final_nb[coll],
jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time))
results_final_recIDs = list(results_final[coll])
results_final_relevances = []
results_final_relevances_prologue = ""
results_final_relevances_epilogue = ""
if sf: # do we have to sort?
results_final_recIDs = sort_records(req, results_final_recIDs, sf, so, sp, verbose, of)
elif rm: # do we have to rank?
results_final_recIDs_ranked, results_final_relevances, results_final_relevances_prologue, results_final_relevances_epilogue, results_final_comments = \
rank_records(rm, 0, results_final[coll],
string.split(p) + string.split(p1) +
string.split(p2) + string.split(p3), verbose)
if of.startswith("h"):
print_warning(req, results_final_comments)
if results_final_recIDs_ranked:
results_final_recIDs = results_final_recIDs_ranked
else:
# rank_records failed and returned some error message to display:
print_warning(req, results_final_relevances_prologue)
print_warning(req, results_final_relevances_epilogue)
print_records(req, results_final_recIDs, jrec, rg, of, ot, ln,
results_final_relevances,
results_final_relevances_prologue,
results_final_relevances_epilogue,
search_pattern=p,
print_records_prologue_p=False,
print_records_epilogue_p=False,
verbose=verbose)
if of.startswith("h"):
req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, coll, results_final_nb[coll],
jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1))
#if hosted_colls and (of.startswith("h") or of.startswith("x")):
if hosted_colls_actual_or_potential_results_p:
if hosted_colls_results:
# TODO: add a verbose message here
for result in hosted_colls_true_results:
if of.startswith("h"):
req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, results_final_nb[result[0][1].name],
jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time))
req.write(print_hosted_results(url_and_engine=result[0], ln=ln, of=of, req=req, limit=rg))
if of.startswith("h"):
req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, results_final_nb[result[0][1].name],
jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1))
if hosted_colls_timeouts:
# TODO: add a verbose message here
# TODO: check if verbose messages still work when dealing with (re)calculations of timeouts
(hosted_colls_timeouts_results, hosted_colls_timeouts_timeouts) = do_calculate_hosted_collections_results(req, ln, None, verbose, None, hosted_colls_timeouts, CFG_HOSTED_COLLECTION_TIMEOUT_POST_SEARCH)
if hosted_colls_timeouts_results:
hosted_colls_timeouts_true_results = []
for result in hosted_colls_timeouts_results:
if result[1] == None or result[1] == False:
## these are the searches that returned no or zero results
## also print a nearest terms box, in case this is the only
## collection being searched and it returns no results?
if of.startswith("h"):
req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, -963,
jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time))
req.write(print_hosted_results(url_and_engine=result[0], ln=ln, of=of, req=req, no_records_found=True, limit=rg))
req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, -963,
jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1))
else:
# these are the searches that actually returned results on time
if of.startswith("h"):
req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, result[1],
jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time))
req.write(print_hosted_results(url_and_engine=result[0], ln=ln, of=of, req=req, limit=rg))
if of.startswith("h"):
req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, result[1],
jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1))
if hosted_colls_timeouts_timeouts:
for timeout in hosted_colls_timeouts_timeouts:
if of.startswith("h"):
req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, timeout[1].name, -963,
jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time))
req.write(print_hosted_results(url_and_engine=timeout[0], ln=ln, of=of, req=req, search_timed_out=True, limit=rg))
req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, timeout[1].name, -963,
jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1))
print_records_epilogue(req, of)
if f == "author" and of.startswith("h"):
req.write(create_similarly_named_authors_link_box(p, ln))
# log query:
try:
id_query = log_query(req.remote_host, req.args, uid)
if of.startswith("h") and id_query:
if not of in ['hcs']:
# display alert/RSS teaser for non-summary formats:
display_email_alert_part = collect_user_info(req)['precached_usealerts']
req.write(websearch_templates.tmpl_alert_rss_teaser_box_for_query(id_query, \
ln=ln, display_email_alert_part=display_email_alert_part))
except:
# do not log query if req is None (used by CLI interface)
raise
pass
log_query_info("ss", p, f, colls_to_search, results_final_nb_total)
# External searches
if of.startswith("h"):
if not of in ['hcs']:
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
return page_end(req, of, ln)
def perform_request_cache(req, action="show"):
"""Manipulates the search engine cache.

Sets the response content type to "text/html", sends the HTTP header
and writes a report to REQ through several req.write() calls.  If
ACTION equals "clear", search_results_cache.clear() is called before
the report is produced.

The report covers, in this order: the collection reclist cache
(collection_reclist_cache), the search results cache
(search_results_cache, shown together with the
CFG_WEBSEARCH_SEARCH_CACHE_SIZE limit), the field I18N names cache
(field_i18nname_cache) and the collection I18N names cache
(collection_i18nname_cache).

Returns a string holding a single newline.
"""
# NOTE(review): several string literals below open and close with a plain
# double quote on different lines, and one opens with a triple quote but
# closes with a single one.  This looks like markup was lost from the
# literals somewhere upstream -- confirm against a pristine copy of the
# file before changing any code here.
req.content_type = "text/html"
req.send_http_header()
req.write("")
out = ""
out += "
Search Cache
"
# clear cache if requested:
if action == "clear":
search_results_cache.clear()
req.write(out)
# show collection reclist cache:
out = "
Collection reclist cache
"
out += "- collection table last updated: %s" % get_table_update_time('collection')
out += " - reclist cache timestamp: %s" % collection_reclist_cache.timestamp
out += " - reclist cache contents:"
out += "
"
for coll in collection_reclist_cache.cache.keys():
if collection_reclist_cache.cache[coll]:
out += "%s (%d) " % (coll, len(collection_reclist_cache.cache[coll]))
out += "
"
req.write(out)
# show search results cache:
out = "
Search Cache
"
out += "- search cache usage: %d queries cached (max. ~%d)" % \
(len(search_results_cache.cache), CFG_WEBSEARCH_SEARCH_CACHE_SIZE)
if len(search_results_cache.cache):
out += " - search cache contents:"
out += "
"
for query, hitset in search_results_cache.cache.items():
out += " %s ... %s" % (query, hitset)
out += """
"
req.write(out)
# show field i18nname cache:
out = "
Field I18N names cache
"
out += "- fieldname table last updated: %s" % get_table_update_time('fieldname')
out += " - i18nname cache timestamp: %s" % field_i18nname_cache.timestamp
out += " - i18nname cache contents:"
out += "
"
for field in field_i18nname_cache.cache.keys():
for ln in field_i18nname_cache.cache[field].keys():
out += "%s, %s = %s " % (field, ln, field_i18nname_cache.cache[field][ln])
out += "
"
req.write(out)
# show collection i18nname cache:
out = "
Collection I18N names cache
"
out += "- collectionname table last updated: %s" % get_table_update_time('collectionname')
out += " - i18nname cache timestamp: %s" % collection_i18nname_cache.timestamp
out += " - i18nname cache contents:"
out += "
"
for coll in collection_i18nname_cache.cache.keys():
for ln in collection_i18nname_cache.cache[coll].keys():
out += "%s, %s = %s " % (coll, ln, collection_i18nname_cache.cache[coll][ln])
out += "
"
req.write(out)
req.write("")
return "\n"
def perform_request_log(req, date=""):
"""Display search log information for given date.

Sets the response content type to "text/html", sends the HTTP header
and writes the report to REQ.

If DATE is non-empty (case A), it is converted to an integer with
string.atoi() and the lines of CFG_LOGDIR/search.log that start with
that number are read through a grep pipe; each line is then split on
"#" into six fields (datetime, aas, p, f, c, nbhits).

If DATE is empty (case B), one "grep -c" is run for every integer in
the range from the first day of the current month up to today (both
formatted as YYYYMMDD from local time) to count that day's queries.

Returns a string holding a single newline.
"""
# NOTE(review): in the visible code the 'try:' in case A has no matching
# 'except', several req.write() literals span lines or mix triple and
# single quotes, and the pipe opened in case B is never closed.  This
# looks like content lost somewhere upstream -- confirm against a
# pristine copy of the file before changing any code here.
req.content_type = "text/html"
req.send_http_header()
req.write("")
req.write("
Search Log
")
if date: # case A: display stats for a day
yyyymmdd = string.atoi(date)
req.write("
Date: %d
" % yyyymmdd)
req.write("""
""")
req.write("
%s
%s
%s
%s
%s
%s
" % ("No.", "Time", "Pattern", "Field", "Collection", "Number of Hits"))
# read file:
p = os.popen("grep ^%d %s/search.log" % (yyyymmdd, CFG_LOGDIR), 'r')
lines = p.readlines()
p.close()
# process lines:
i = 0
for line in lines:
try:
datetime, aas, p, f, c, nbhits = string.split(line,"#")
i += 1
req.write("
")
else: # case B: display summary stats per day
yyyymm01 = int(time.strftime("%Y%m01", time.localtime()))
yyyymmdd = int(time.strftime("%Y%m%d", time.localtime()))
req.write("""
""")
req.write("
%s
%s
" % ("Day", "Number of Queries"))
for day in range(yyyymm01, yyyymmdd + 1):
p = os.popen("grep -c ^%d %s/search.log" % (day, CFG_LOGDIR), 'r')
for line in p.readlines():
req.write("""
")
req.write("")
return "\n"
def get_most_popular_field_values(recids, tags, exclude_values=None, count_repetitive_values=True):
    """
    Analyze RECIDS and look for TAGS and return most popular values
    and the frequency with which they occur sorted according to
    descending frequency (ties are ordered case-insensitively by value).

    TAGS may be a single tag string or a sequence of tag strings.

    If a value is found in EXCLUDE_VALUES, then do not count it.
    EXCLUDE_VALUES may be a sequence of values or a single string; a
    single string is treated as exactly one value to exclude.

    If COUNT_REPETITIVE_VALUES is True, then we count every occurrence
    of value in the tags.  If False, then we count the value only once
    regardless of the number of times it may appear in a record.
    (But, if the same value occurs in another record, we count it, of
    course.)

    Returns a tuple of (value, frequency) tuples.

    Example:
     >>> get_most_popular_field_values(range(11,20), '980__a')
     (('PREPRINT', 10), ('THESIS', 7), ...)
     >>> get_most_popular_field_values(range(11,20), ('100__a', '700__a'))
     (('Ellis, J', 10), ('Ellis, N', 7), ...)
     >>> get_most_popular_field_values(range(11,20), ('100__a', '700__a'), ('Ellis, J'))
     (('Ellis, N', 7), ...)
    """

    def _frequency_then_alphabet(item):
        "Sort key for a (value, frequency) pair: highest frequency first, then case-insensitive value."
        return (-item[1], item[0].lower())

    ## sanity check: accept a bare string wherever a sequence is expected
    if not exclude_values:
        exclude_values = ()
    elif isinstance(exclude_values, str):
        # ('Ellis, J') is just a string; without wrapping it, the
        # membership test below would degrade to a substring match
        exclude_values = (exclude_values,)
    if isinstance(tags, str):
        tags = (tags,)

    ## find values to count:
    vals_to_count = []
    if count_repetitive_values:
        # counting technique A: one lookup per tag covering all records
        for tag in tags:
            vals_to_count.extend(get_fieldvalues(recids, tag))
    else:
        # counting technique B: must look record-by-record so that a
        # value is counted at most once per record, even across tags
        for recid in recids:
            seen_in_rec = {}
            for tag in tags:
                for val in get_fieldvalues(recid, tag, False):
                    seen_in_rec[val] = 1
            vals_to_count.extend(seen_in_rec.keys())

    ## count occurrences, leaving out excluded values:
    valuefreqdict = {}
    for val in vals_to_count:
        if val not in exclude_values:
            valuefreqdict[val] = valuefreqdict.get(val, 0) + 1

    ## sort by descending frequency of values:
    return tuple(sorted(valuefreqdict.items(), key=_frequency_then_alphabet))
def profile(p="", f="", c=CFG_SITE_NAME):
    """Profile search time.

    Run perform_request_search(p=P, f=F, c=C) under the standard
    profiler, store the raw data in the file named
    'perform_request_search_profile' and print the statistics sorted
    by cumulative time.  Return 0.
    """
    # Imported locally: inside this function the name 'profile' refers to
    # the standard library module, not to this function.
    import profile
    import pstats
    # %r renders each argument as a valid Python literal, so search
    # patterns containing quotes or backslashes do not break (or alter)
    # the statement handed to the profiler.
    command = "perform_request_search(p=%r, f=%r, c=%r)" % (p, f, c)
    profile.run(command, "perform_request_search_profile")
    stats = pstats.Stats("perform_request_search_profile")
    stats.strip_dirs().sort_stats("cumulative").print_stats()
    return 0
## test cases:
#print wash_colls(CFG_SITE_NAME,"Library Catalogue", 0)
#print wash_colls("Periodicals & Progress Reports",["Periodicals","Progress Reports"], 0)
#print wash_field("wau")
#print print_record(20,"tm","001,245")
#print create_opft_search_units(None, "PHE-87-13","reportnumber")
#print ":"+wash_pattern("* and % doo * %")+":\n"
#print ":"+wash_pattern("*")+":\n"
#print ":"+wash_pattern("ellis* ell* e*%")+":\n"
#print run_sql("SELECT name,dbquery from collection")
#print get_index_id("author")
#print get_coll_ancestors("Theses")
#print get_coll_sons("Articles & Preprints")
#print get_coll_real_descendants("Articles & Preprints")
#print get_collection_reclist("Theses")
#print log(sys.stdin)
#print search_unit_in_bibrec('2002-12-01','2002-12-12')
#print get_nearest_terms_in_bibxxx("ellis", "author", 5, 5)
#print call_bibformat(68, "HB_FLY")
#print get_fieldvalues(10, "980__a")
#print get_fieldvalues_alephseq_like(10,"001___")
#print get_fieldvalues_alephseq_like(10,"980__a")
#print get_fieldvalues_alephseq_like(10,"foo")
#print get_fieldvalues_alephseq_like(10,"-1")
#print get_fieldvalues_alephseq_like(10,"99")
#print get_fieldvalues_alephseq_like(10,["001", "980"])
## profiling:
#profile("of the this")
#print perform_request_search(p="ellis")