diff --git a/config/invenio.conf b/config/invenio.conf
index 3c8d13dbf..cba9be3b7 100644
--- a/config/invenio.conf
+++ b/config/invenio.conf
@@ -1,2470 +1,2475 @@
## This file is part of Invenio.
## Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

###################################################
## About 'invenio.conf' and 'invenio-local.conf' ##
###################################################

## The 'invenio.conf' file contains the vanilla default configuration
## parameters of an Invenio installation, as coming out of the
## distribution. The file should be self-explanatory. Once installed
## in its usual location (usually /opt/invenio/etc), you could in
## principle go ahead and change the values according to your local
## needs, but this is not advised.
##
## If you would like to customize some of these parameters, you should
## rather create a file named 'invenio-local.conf' in the same
## directory where 'invenio.conf' lives and you should write there
## only the customizations that you want to be different from the
## vanilla defaults.
##
## Here is a realistic, minimalist, yet production-ready example of
## what you would typically put there:
##
## $ cat /opt/invenio/etc/invenio-local.conf
## [Invenio]
## CFG_SITE_NAME = John Doe's Document Server
## CFG_SITE_NAME_INTL_fr = Serveur des Documents de John Doe
## CFG_SITE_URL = http://your.site.com
## CFG_SITE_SECURE_URL = https://your.site.com
## CFG_SITE_ADMIN_EMAIL = john.doe@your.site.com
## CFG_SITE_SUPPORT_EMAIL = john.doe@your.site.com
## CFG_WEBALERT_ALERT_ENGINE_EMAIL = john.doe@your.site.com
## CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL = john.doe@your.site.com
## CFG_WEBCOMMENT_DEFAULT_MODERATOR = john.doe@your.site.com
## CFG_BIBAUTHORID_AUTHOR_TICKET_ADMIN_EMAIL = john.doe@your.site.com
## CFG_BIBCATALOG_SYSTEM_EMAIL_ADDRESS = john.doe@your.site.com
## CFG_DATABASE_HOST = localhost
## CFG_DATABASE_NAME = invenio
## CFG_DATABASE_USER = invenio
## CFG_DATABASE_PASS = my123p$ss
##
## You should override at least the parameters mentioned above and the
## parameters mentioned in the `Part 1: Essential parameters' below in
## order to define some very essential runtime parameters such as the
## name of your document server (CFG_SITE_NAME and
## CFG_SITE_NAME_INTL_*), the visible URL of your document server
## (CFG_SITE_URL and CFG_SITE_SECURE_URL), the email address of the
## local Invenio administrator, comment moderator, and alert engine
## (CFG_SITE_SUPPORT_EMAIL, CFG_SITE_ADMIN_EMAIL, etc), and last but
## not least your database credentials (CFG_DATABASE_*).
##
## The Invenio system will then read both the default invenio.conf
## file and your customized invenio-local.conf file and it will
## override any default options with the ones you have specified in
## your local file.
## This cascading of configuration parameters will ease your future
## upgrades.

[Invenio]

###################################
## Part 1: Essential parameters ##
###################################

## This part defines essential Invenio internal parameters that
## everybody should override, like the name of the server or the email
## address of the local Invenio administrator.

## CFG_DATABASE_* - specify which MySQL server to use, the name of the
## database to use, and the database access credentials.
CFG_DATABASE_HOST = localhost
CFG_DATABASE_PORT = 3306
CFG_DATABASE_NAME = invenio
CFG_DATABASE_USER = invenio
CFG_DATABASE_PASS = my123p$ss

## CFG_DATABASE_SLAVE - if you use DB replication, then specify the DB
## slave address credentials. (Assuming the same access rights to the
## DB slave as to the DB master.) If you don't use DB replication,
## then leave this option blank.
CFG_DATABASE_SLAVE =

## CFG_DATABASE_SLAVE_SU_USER - a special user that is authorized to
## detach/attach the slave from/to the master, thus allowing consistent
## dbdump on the slave database.
CFG_DATABASE_SLAVE_SU_USER = slaveadmin
CFG_DATABASE_SLAVE_SU_PASS =

## CFG_DATABASE_PASSWORD_FILE - if you want to store your password in
## a separate special file that is only readable by the dbquery user,
## specify here the path. The file should contain one user per row,
## with the format:
## username // password
## i.e. with the username, the 4 characters " // ", and the password.
CFG_DATABASE_PASSWORD_FILE =

## CFG_SITE_URL - specify the URL under which your installation will
## be visible. For example, use "http://your.site.com". Do not leave
## a trailing slash.
CFG_SITE_URL = http://localhost

## CFG_SITE_SECURE_URL - specify the secure URL under which your
## installation's secure pages such as login or registration will be
## visible. For example, use "https://your.site.com". Do not leave
## a trailing slash. If you don't plan on using HTTPS, then you may
## leave this empty.
CFG_SITE_SECURE_URL = https://localhost

## CFG_SITE_NAME -- the visible name of your Invenio installation.
CFG_SITE_NAME = Atlantis Institute of Fictive Science

## CFG_SITE_NAME_INTL -- the international versions of CFG_SITE_NAME
## in various languages. (See also CFG_SITE_LANGS below.)
CFG_SITE_NAME_INTL_en = Atlantis Institute of Fictive Science
CFG_SITE_NAME_INTL_fr = Atlantis Institut des Sciences Fictives
CFG_SITE_NAME_INTL_de = Atlantis Institut der fiktiven Wissenschaft
CFG_SITE_NAME_INTL_es = Instituto de Ciencia Ficticia Atlantis
CFG_SITE_NAME_INTL_ca = Institut Atlantis de Ciència Fictícia
CFG_SITE_NAME_INTL_pt = Instituto Atlantis de Ciência Fictícia
CFG_SITE_NAME_INTL_it = Atlantis Istituto di Scienza Fittizia
CFG_SITE_NAME_INTL_ru = Институт Фиктивных Наук Атлантиды
CFG_SITE_NAME_INTL_sk = Atlantis Inštitút Fiktívnych Vied
CFG_SITE_NAME_INTL_cs = Atlantis Institut Fiktivních Věd
CFG_SITE_NAME_INTL_no = Atlantis Institutt for Fiktiv Vitenskap
CFG_SITE_NAME_INTL_sv = Atlantis Institut för Fiktiv Vetenskap
CFG_SITE_NAME_INTL_el = Ινστιτούτο Φανταστικών Επιστημών Ατλαντίδος
CFG_SITE_NAME_INTL_uk = Інститут вигаданих наук в Атлантісі
CFG_SITE_NAME_INTL_ja = Fictive 科学のAtlantis の協会
CFG_SITE_NAME_INTL_pl = Instytut Fikcyjnej Nauki Atlantis
CFG_SITE_NAME_INTL_bg = Институт за фиктивни науки Атлантис
CFG_SITE_NAME_INTL_hr = Institut Fiktivnih Znanosti Atlantis
CFG_SITE_NAME_INTL_zh_CN = 阿特兰提斯虚拟科学学院
CFG_SITE_NAME_INTL_zh_TW = 阿特蘭提斯虛擬科學學院
CFG_SITE_NAME_INTL_hu = Kitalált Tudományok Atlantiszi Intézete
CFG_SITE_NAME_INTL_af = Atlantis Instituut van Fiktiewe Wetenskap
CFG_SITE_NAME_INTL_gl = Instituto Atlantis de Ciencia Fictive
CFG_SITE_NAME_INTL_ro = Institutul Atlantis al Ştiinţelor Fictive
CFG_SITE_NAME_INTL_rw = Atlantis Ishuri Rikuru Ry'ubuhanga
CFG_SITE_NAME_INTL_ka = ატლანტიდის ფიქტიური მეცნიერების ინსტიტუტი
CFG_SITE_NAME_INTL_lt = Fiktyvių Mokslų Institutas Atlantis
CFG_SITE_NAME_INTL_ar = معهد أطلنطيس للعلوم الافتراضية
CFG_SITE_NAME_INTL_fa = موسسه علوم تخیلی آتلانتیس

## CFG_SITE_LANG -- the default language of the interface:
CFG_SITE_LANG = en

## CFG_SITE_LANGS -- list of all languages the user interface should
## be available in, separated by commas. The order specified below
## will be respected on the interface pages. A good default would be
## to use the alphabetical order. Currently supported languages
## include Afrikaans, Arabic, Bulgarian, Catalan, Czech, German,
## Georgian, Greek, English, Spanish, Persian (Farsi), French,
## Croatian, Hungarian, Galician, Italian, Japanese, Kinyarwanda,
## Lithuanian, Norwegian, Polish, Portuguese, Romanian, Russian,
## Slovak, Swedish, Ukrainian, Chinese (China), Chinese (Taiwan), so
## that the maximum you can currently select is
## "af,ar,bg,ca,cs,de,el,en,es,fa,fr,hr,gl,ka,it,rw,lt,hu,ja,no,pl,pt,ro,ru,sk,sv,uk,zh_CN,zh_TW".
CFG_SITE_LANGS = af,ar,bg,ca,cs,de,el,en,es,fa,fr,hr,gl,ka,it,rw,lt,hu,ja,no,pl,pt,ro,ru,sk,sv,uk,zh_CN,zh_TW

## CFG_SITE_SUPPORT_EMAIL -- the email address of the support team for
## this installation:
CFG_SITE_SUPPORT_EMAIL = info@invenio-software.org

## CFG_SITE_ADMIN_EMAIL -- the email address of the 'superuser' for
## this installation. Enter your email address below and login with
## this address when using Invenio administration modules. You
## will then be automatically recognized as superuser of the system.
CFG_SITE_ADMIN_EMAIL = info@invenio-software.org

## CFG_SITE_EMERGENCY_EMAIL_ADDRESSES -- list of email addresses to
## which an email should be sent in case of emergency (e.g. bibsched
## queue has been stopped because of an error). Configuration
## dictionary allows for different recipients based on weekday and
## time-of-day.
## Example:
##
## CFG_SITE_EMERGENCY_EMAIL_ADDRESSES = {
##    'Sunday 22:00-06:00': '0041761111111@email2sms.foo.com',
##    '06:00-18:00': 'team-in-europe@foo.com,0041762222222@email2sms.foo.com',
##    '18:00-06:00': 'team-in-usa@foo.com',
##    '*': 'john.doe.phone@foo.com'}
##
## If you want the emergency email notifications to always go to the
## same address, just use the wildcard line in the above example.
CFG_SITE_EMERGENCY_EMAIL_ADDRESSES = {}

## CFG_SITE_ADMIN_EMAIL_EXCEPTIONS -- set this to 0 if you do not want
## to receive any captured exception via email to the
## CFG_SITE_ADMIN_EMAIL address. Captured exceptions will still be
## available in the var/log/invenio.err file. Set this to 1 if you want
## to receive some of the captured exceptions (this depends on the
## actual place where the exception is captured). Set this to 2 if you
## want to receive all captured exceptions.
CFG_SITE_ADMIN_EMAIL_EXCEPTIONS = 1

## CFG_SITE_RECORD -- what is the URI part representing detailed
## record pages? We recommend leaving the default value `record'
## unchanged.
CFG_SITE_RECORD = record

## CFG_ERRORLIB_RESET_EXCEPTION_NOTIFICATION_COUNTER_AFTER -- set this to
## the number of seconds after which to reset the exception notification
## counter. A given repetitive exception is notified via email with a
## logarithmic strategy: the first time it is seen it is sent via email,
## then the second time, then the fourth, then the eighth and so forth.
## If the number of seconds elapsed since the last time it was notified
## is greater than CFG_ERRORLIB_RESET_EXCEPTION_NOTIFICATION_COUNTER_AFTER
## then the internal counter is reset in order not to have exception
## notifications become more and more rare.
CFG_ERRORLIB_RESET_EXCEPTION_NOTIFICATION_COUNTER_AFTER = 14400

## CFG_CERN_SITE -- do we want to enable CERN-specific code?
## Put "1" for "yes" and "0" for "no".
CFG_CERN_SITE = 0

## CFG_INSPIRE_SITE -- do we want to enable INSPIRE-specific code?
## Put "1" for "yes" and "0" for "no".
CFG_INSPIRE_SITE = 0

## CFG_ADS_SITE -- do we want to enable ADS-specific code?
## Put "1" for "yes" and "0" for "no".
CFG_ADS_SITE = 0

## CFG_OPENAIRE_SITE -- do we want to enable OpenAIRE-specific code?
## Put "1" for "yes" and "0" for "no".
CFG_OPENAIRE_SITE = 0

## Now you can tune whether to integrate with external authentication
## providers through the OpenID and OAuth protocols.
## The following variables let you fine-tune which authentication
## providers you want to authorize. You can override here most of
## the variables in lib/invenio/access_control_config.py.
## In particular you can put in these variables the consumer_key and
## consumer_secret of the desired services.
## Note: some providers don't supply an email address.
## If you choose them, the users will be registered with a temporary
## email address.

## CFG_OPENID_PROVIDERS -- comma-separated list of providers you want
## to enable through the OpenID protocol.
## E.g.: CFG_OPENID_PROVIDERS = google,yahoo,aol,wordpress,myvidoop,openid,verisign,myopenid,myspace,livejournal,blogger
CFG_OPENID_PROVIDERS =

## CFG_OAUTH1_PROVIDERS -- comma-separated list of providers you want
## to enable through the OAuth1 protocol.
## Note: OAuth1 is in general deprecated in favour of OAuth2.
## E.g.: CFG_OAUTH1_PROVIDERS = twitter,linkedin,flickr
CFG_OAUTH1_PROVIDERS =

## CFG_OAUTH2_PROVIDERS -- comma-separated list of providers you want
## to enable through the OAuth2 protocol.
## Note: if you enable the "orcid" provider, the full profile of the
## user in Orcid will be imported.
## E.g.: CFG_OAUTH2_PROVIDERS = facebook,yammer,foursquare,googleoauth2,instagram,orcid
CFG_OAUTH2_PROVIDERS = orcid

## CFG_OPENID_CONFIGURATIONS -- mapping of special parameters to configure the
## desired OpenID providers. Use this variable to override out-of-the-box
## parameters already set in lib/python/invenio/access_control_config.py.
## E.g.: CFG_OPENID_CONFIGURATIONS = {'google': {
##           'identifier': 'https://www.google.com/accounts/o8/id',
##           'trust_email': True}}
CFG_OPENID_CONFIGURATIONS = {}

## CFG_OAUTH1_CONFIGURATIONS -- mapping of special parameters to configure the
## desired OAuth1 providers. Use this variable to override out-of-the-box
## parameters already set in lib/python/invenio/access_control_config.py.
## E.g.: CFG_OAUTH1_CONFIGURATIONS = {'linkedin': {
##           'consumer_key' : 'MY_LINKEDIN_CONSUMER_KEY',
##           'consumer_secret' : 'MY_LINKEDIN_CONSUMER_SECRET'}}
CFG_OAUTH1_CONFIGURATIONS = {}

## CFG_OAUTH2_CONFIGURATIONS -- mapping of special parameters to configure the
## desired OAuth2 providers. Use this variable to override out-of-the-box
## parameters already set in lib/python/invenio/access_control_config.py.
## E.g.: CFG_OAUTH2_CONFIGURATIONS = {'orcid': {
##           'consumer_key' : 'MY_ORCID_CONSUMER_KEY',
##           'consumer_secret' : 'MY_ORCID_CONSUMER_SECRET'}}
CFG_OAUTH2_CONFIGURATIONS = {}

## CFG_DEVEL_SITE -- is this a development site? If it is, you might
## prefer that it does not do certain things. For example, you might
## not want WebSubmit to send certain emails or trigger certain
## processes on a development site.
## Put "1" for "yes" (this is a development site) or "0" for "no"
## (this isn't a development site).
CFG_DEVEL_SITE = 0

## CFG_PROPAGATE_EXCEPTIONS -- tells register_exception to propagate the
## exception instead of catching it and sending an email. Mostly for
## development, where custom exception handlers are handier than
## emails.
CFG_PROPAGATE_EXCEPTIONS = 0

################################
## Part 2: Web page style ##
################################

## The variables affecting the page style. The most important one is
## the 'template skin' you would like to use and the obfuscation mode
## for your email addresses. Please refer to the WebStyle Admin Guide
## for more explanation. The other variables are listed here mostly
## for backwards compatibility purposes only.

## CFG_WEBSTYLE_TEMPLATE_SKIN -- what template skin do you want to
## use?
CFG_WEBSTYLE_TEMPLATE_SKIN = default

## CFG_WEBSTYLE_EMAIL_ADDRESSES_OBFUSCATION_MODE -- how do we "protect"
## email addresses from undesired automated email harvesters? This
## setting will not affect 'support' and 'admin' emails.
## NOTE: there is no ultimate solution to protect against email
## harvesting. All modes have drawbacks and can more or less be
## circumvented. Choose your preferred mode ([t] means "transparent"
## for the user):
##     -1: hide all emails.
## [t]  0: no protection, email returned as is.
##         foo@example.com => foo@example.com
##      1: basic email munging: replaces @ by [at] and . by [dot]
##         foo@example.com => foo [at] example [dot] com
## [t]  2: transparent name mangling: characters are replaced by
##         equivalent HTML entities.
##         foo@example.com => &#102;&#111;&#111;&#64;&#101;&#120;&#97;&#109;&#112;&#108;&#101;&#46;&#99;&#111;&#109;
## [t]  3: javascript insertion. Requires Javascript enabled on the
##         client side.
##      4: replaces @ and . characters by gif equivalents.
##         foo@example.com => foo [at] example [dot] com
CFG_WEBSTYLE_EMAIL_ADDRESSES_OBFUSCATION_MODE = 2

## CFG_WEBSTYLE_INSPECT_TEMPLATES -- do we want to debug all template
## functions so that they would return HTML results wrapped in
## comments indicating which part of the HTML page was created by which
## template function? Useful only for debugging Pythonic HTML
## templates. See the WebStyle Admin Guide for more information.
CFG_WEBSTYLE_INSPECT_TEMPLATES = 0

## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXLEFTTOP -- eventual global HTML
## left top box:
CFG_WEBSTYLE_CDSPAGEBOXLEFTTOP =

## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXLEFTBOTTOM -- eventual global
## HTML left bottom box:
CFG_WEBSTYLE_CDSPAGEBOXLEFTBOTTOM =

## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXRIGHTTOP -- eventual global
## HTML right top box:
CFG_WEBSTYLE_CDSPAGEBOXRIGHTTOP =

## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXRIGHTBOTTOM -- eventual global
## HTML right bottom box:
CFG_WEBSTYLE_CDSPAGEBOXRIGHTBOTTOM =

## CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST -- when certain HTTP status
## codes are raised to the WSGI handler, the corresponding exceptions
## and error messages can be sent to the system administrator for
## inspection. This is useful to detect and correct errors. The
## variable represents a comma-separated list of HTTP statuses that
## should alert the admin. Wildcards are possible. If the status is
## followed by an "r", it means that a referer is required to exist
## (useful to distinguish broken known links from URL typos when 404
## errors are raised).
CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST = 404r,400,5*,41*

## CFG_WEBSTYLE_HTTP_USE_COMPRESSION -- whether to enable deflate
## compression of your HTTP/HTTPS connections. This will affect the
## Apache configuration snippets created by inveniocfg
## --create-apache-conf and the OAI-PMH Identify response.
CFG_WEBSTYLE_HTTP_USE_COMPRESSION = 0

## CFG_WEBSTYLE_REVERSE_PROXY_IPS -- if you are setting up a multinode
## environment where an HTTP proxy such as mod_proxy is sitting in
## front of the Invenio web application and is forwarding requests to
## worker nodes, set here the list of IP addresses of the allowed
## HTTP proxies. This is needed in order to avoid IP address spoofing
## when worker nodes are also available on the public Internet and
## might receive forged HTTP requests. Only HTTP requests coming from
## the specified IP addresses will be considered as forwarded from a
## reverse proxy. E.g. set this to '123.123.123.123'.
CFG_WEBSTYLE_REVERSE_PROXY_IPS =

##################################
## Part 3: WebSearch parameters ##
##################################

## This section contains some configuration parameters for the
## WebSearch module. Please note that WebSearch is mostly configured
## at run-time via its WebSearch Admin web interface. The parameters
## below are the ones that you probably do not want to modify very
## often during the runtime. (Note that you may modify them
## afterwards too, though.)

## CFG_WEBSEARCH_SEARCH_CACHE_SIZE -- how many queries do we want to
## cache in memory per one Apache httpd process? This cache is used
## mainly for "next/previous page" functionality, but it also caches
## "popular" user queries if more than one user happens to search for
## the same thing. Note that large numbers may lead to great memory
## consumption. We recommend a value not greater than 100.
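## For instance, a cautious override in your invenio-local.conf, in
## line with the above recommendation (the value is illustrative):
## CFG_WEBSEARCH_SEARCH_CACHE_SIZE = 100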
CFG_WEBSEARCH_SEARCH_CACHE_SIZE = 0

## CFG_WEBSEARCH_FIELDS_CONVERT -- if you migrate from an older
## system, you may want to map field codes of your old system (such as
## 'ti') to Invenio/MySQL ones ("title"). Use Python dictionary syntax
## for the translation table, e.g. {'wau':'author', 'wti':'title'}.
## Usually you don't want to do that, and you would use an empty dict {}.
CFG_WEBSEARCH_FIELDS_CONVERT = {}

## CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH -- width of the
## search pattern window in the light search interface, in
## characters.
CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH = 60

## CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH -- width of the search
## pattern window in the simple search interface, in characters.
CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH = 40

## CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH -- width of the
## search pattern window in the advanced search interface, in
## characters.
CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH = 30

## CFG_WEBSEARCH_NB_RECORDS_TO_SORT -- how many records do we still
## want to sort? For higher numbers we print only a warning and won't
## perform any sorting other than the default 'latest records first',
## as sorting would be very time consuming then. We recommend a value
## of not more than a couple of thousands.
CFG_WEBSEARCH_NB_RECORDS_TO_SORT = 1000

## CFG_WEBSEARCH_CALL_BIBFORMAT -- if a record is being displayed but
## it was not preformatted in the "HTML brief" format, do we want to
## call BibFormat on the fly? Put "1" for "yes" and "0" for "no".
## Note that "1" will display the record exactly as if it were fully
## preformatted, but it may be slow due to on-the-fly processing; "0"
## will display a default format very fast, but it may not have all
## the fields as in the fully preformatted HTML brief format. Note
## also that this option is active only for old (PHP) formats; the new
## (Python) formats are called on the fly by default anyway, since
## they are much faster. When unsure, please set "0" here.
CFG_WEBSEARCH_CALL_BIBFORMAT = 0

## CFG_WEBSEARCH_USE_ALEPH_SYSNOS -- do we want to make old SYSNOs
## visible rather than MySQL's record IDs? You may use this if you
## migrate from a different e-doc system, and you store your old
## system numbers into 970__a. Put "1" for "yes" and "0" for
## "no". Usually you don't want to do that, though.
CFG_WEBSEARCH_USE_ALEPH_SYSNOS = 0

## CFG_WEBSEARCH_I18N_LATEST_ADDITIONS -- put "1" if you want the
## "Latest Additions" in the web collection pages to show
## internationalized records. Useful only if your brief BibFormat
## templates contain internationalized strings. Otherwise put "0" in
## order not to slow down the creation of latest additions by WebColl.
CFG_WEBSEARCH_I18N_LATEST_ADDITIONS = 0

## CFG_WEBSEARCH_INSTANT_BROWSE -- the number of records to display
## under 'Latest Additions' in the web collection pages.
CFG_WEBSEARCH_INSTANT_BROWSE = 10

## CFG_WEBSEARCH_INSTANT_BROWSE_RSS -- the number of records to
## display in the RSS feed.
CFG_WEBSEARCH_INSTANT_BROWSE_RSS = 25

## CFG_WEBSEARCH_RSS_I18N_COLLECTIONS -- comma-separated list of
## collections that feature an internationalized RSS feed on their
## main search interface page created by webcoll. Other collections
## will have an RSS feed using CFG_SITE_LANG.
CFG_WEBSEARCH_RSS_I18N_COLLECTIONS =

## CFG_WEBSEARCH_RSS_TTL -- number of minutes that indicates how long
## a feed cache is valid.
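## E.g. the default of 360 minutes below keeps a feed cached for six
## hours before it is regenerated.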
CFG_WEBSEARCH_RSS_TTL = 360

## CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS -- maximum number of requests
## kept in cache. If the cache is filled, subsequent requests are not
## cached.
CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS = 1000

## CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD -- up to how many author names
## to print explicitly; for more, print "et al". Note that this is
## used in the default formatting that is seldom used, as usually
## BibFormat defines all the formats. The value below is only used
## when BibFormat fails, for example.
CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD = 3

## CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS -- whether or not to show
## collection grandsons in Narrow Search boxes (sons are shown by
## default, grandsons are configurable here). Use 0 for no and 1 for
## yes.
CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS = 1

## CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX -- shall we
## create help links for Ellis, Nick or Ellis, Nicholas and friends
## when Ellis, N was searched for? Useful if you have one author
## stored in the database under several name formats, namely surname
## comma firstname and surname comma initial cataloging policy. Use 0
## for no and 1 for yes.
CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX = 1

## CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS -- MathJax is a JavaScript
## library that renders (La)TeX mathematical formulas in the client
## browser. This parameter must contain a comma-separated list of
## output formats for which to apply the MathJax rendering, for example
## "hb,hd". If the list is empty, MathJax is disabled.
## See also CFG_WEBSUBMIT_USE_MATHJAX.
CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS =

## CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_TIMEOUT -- when searching
## external collections (e.g. SPIRES, CiteSeer, etc), how many seconds
## do we wait for a reply before abandoning?
CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_TIMEOUT = 5

## CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_MAXRESULTS -- how many
## results do we fetch?
CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_MAXRESULTS = 10

## CFG_WEBSEARCH_SPLIT_BY_COLLECTION -- do we want to split the search
## results by collection or not? Use 0 for no, 1 for yes.
CFG_WEBSEARCH_SPLIT_BY_COLLECTION = 1

## CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS -- the default number of
## records to display per page in the search results pages.
CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS = 10

## CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS -- in order to limit denial of
## service attacks, the total number of records per group displayed as a
## result of a search query will be limited to this number. Only
## superuser queries will not be affected by this limit.
CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS = 200

## CFG_WEBSEARCH_SHOW_COMMENT_COUNT -- do we want to show the 'N comments'
## links on the search engine pages? (useful only when you have allowed
## commenting)
CFG_WEBSEARCH_SHOW_COMMENT_COUNT = 1

## CFG_WEBSEARCH_SHOW_REVIEW_COUNT -- do we want to show the 'N reviews'
## links on the search engine pages? (useful only when you have allowed
## reviewing)
CFG_WEBSEARCH_SHOW_REVIEW_COUNT = 1

## CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR -- how do we want to
## generate full-text snippets? They can be generated by 'native'
## Invenio or 'SOLR'.
CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR = native

## CFG_WEBSEARCH_FULLTEXT_SNIPPETS -- how many full-text snippets do
## we want to display for full-text searches? If you want to specify
## different values for different document status types, please add
## more items into this dictionary.
## (Unless specified, the empty value will be used as default.) This
## is useful if you have restricted files of different types with
## various restrictions on what we can show.
CFG_WEBSEARCH_FULLTEXT_SNIPPETS = {
    '': 4,
    }

## CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS -- what is the maximum size
## of a snippet to display around the pattern found in the full-text?
## If you want to specify different values for different document
## status types, please add more items into this dictionary. (Unless
## specified, the empty value will be used as default.) This is
## useful if you have restricted files of different types with various
## restrictions on what we can show.
CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS = {
    '': 100,
    }

## CFG_WEBSEARCH_WILDCARD_LIMIT -- some of the queries, wildcard
## queries in particular (ex: cern*, a*), but also regular expressions
## (ex: [a-z]+), may take a long time to respond due to the high
## number of hits. You can limit the number of terms matched by a
## wildcard by setting this variable. A negative value or zero means
## that none of the queries will be limited (which may be wanted, but
## is also prone to denial-of-service kind of attacks).
CFG_WEBSEARCH_WILDCARD_LIMIT = 50000

## CFG_WEBSEARCH_SYNONYM_KBRS -- defines which knowledge bases are to
## be used for which index in order to provide runtime synonym lookup
## of user-supplied terms, and what massaging function should be used
## upon the search pattern before performing the KB lookup. (Can be
## one of `exact', `leading_to_comma', `leading_to_number'.)
CFG_WEBSEARCH_SYNONYM_KBRS = {
    'journal': ['SEARCH-SYNONYM-JOURNAL', 'leading_to_number'],
    }

## CFG_SOLR_URL -- optionally, you may use Solr to serve full-text
## queries and ranking. If so, please specify the URL of your Solr
## instance. Example: http://localhost:8983/solr (default solr port)
CFG_SOLR_URL =

## CFG_XAPIAN_ENABLED -- optionally, you may use Xapian to serve
## full-text queries and ranking. If so, please enable it: 1 = enabled
CFG_XAPIAN_ENABLED =

## CFG_WEBSEARCH_PREV_NEXT_HIT_LIMIT -- specify the limit up to which
## the previous/next/back hit links are to be displayed on detailed
## record pages. To speed up list manipulations, if a search returns
## more hits than this limit, then do not lose time calculating
## next/previous/back hits at all, but display the page directly
## without them. Note also that Invenio installations that do not
## want the next/previous hit link functionality can set this
## variable to zero and not see anything.
CFG_WEBSEARCH_PREV_NEXT_HIT_LIMIT = 1000

## CFG_WEBSEARCH_PREV_NEXT_HIT_FOR_GUESTS -- set this to 0 if you want
## to disable the previous/next/back hit link functionality for guest
## users.
## Since the previous/next/back hit link functionality causes the
## allocation of a user session in the database even for guest users,
## it might be useful to be able to disable it, e.g. when your site is
## bombarded by web requests (a.k.a. the Slashdot effect).
CFG_WEBSEARCH_PREV_NEXT_HIT_FOR_GUESTS = 1

## CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY -- when a record belongs to more
## than one restricted collection, if the viewrestrcoll policy is set
## to "ALL" (default) then the user must be authorized to all the
## restricted collections in order to be granted access to the
## specific record. If the policy is set to "ANY", then the user
## needs to be authorized to only one of the collections in order to
## be granted access to the specific record.
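## E.g., for a record belonging to restricted collections A and B:
## under "ALL" the user must be authorized to both A and B to view the
## record, while under "ANY" authorization to either A or B suffices.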
CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY = ANY

## CFG_WEBSEARCH_SPIRES_SYNTAX -- variable to configure the use of the
## SPIRES query syntax in searches. Values: 0 = SPIRES syntax is
## switched off; 1 = leading 'find' is required; 9 = leading 'find' is
## not required (leading SPIRES operator, space-operator-space, etc
## are also accepted).
CFG_WEBSEARCH_SPIRES_SYNTAX = 1

## CFG_WEBSEARCH_DISPLAY_NEAREST_TERMS -- when a user search does not
## return any direct result, what do we want to display? Set to 0 in
## order to display a generic message about the search returning no
## hits. Set to 1 in order to display a list of nearest terms from
## the indexes that may match the user query. Note: this functionality
## may be slow, so you may want to disable it on bigger sites.
CFG_WEBSEARCH_DISPLAY_NEAREST_TERMS = 1

## CFG_WEBSEARCH_DETAILED_META_FORMAT -- the output format to use for
## detailed meta tags containing metadata as configured in the tag
## table. The default output format 'hdm' should be included. This
## format will be included in the header of /record/ pages. For
## efficiency this format should be pre-cached with BibReformat. See
## also CFG_WEBSEARCH_ENABLE_GOOGLESCHOLAR and
## CFG_WEBSEARCH_ENABLE_OPENGRAPH.
CFG_WEBSEARCH_DETAILED_META_FORMAT = hdm

## CFG_WEBSEARCH_ENABLE_GOOGLESCHOLAR -- decides if meta tags for
## Google Scholar shall be included in the detailed record page
## header, when using the standard formatting templates/elements. See
## also CFG_WEBSEARCH_DETAILED_META_FORMAT and
## CFG_WEBSEARCH_ENABLE_OPENGRAPH. When this variable is changed and
## the output format defined in CFG_WEBSEARCH_DETAILED_META_FORMAT is
## cached, a bibreformat must be run for the cached records.
CFG_WEBSEARCH_ENABLE_GOOGLESCHOLAR = True

## CFG_WEBSEARCH_ENABLE_OPENGRAPH -- decides if meta tags for the Open
## Graph protocol shall be included in the detailed record page
## header, when using the standard formatting templates/elements. See
## also CFG_WEBSEARCH_DETAILED_META_FORMAT and
## CFG_WEBSEARCH_ENABLE_GOOGLESCHOLAR. When this variable is changed
## and the output format defined in CFG_WEBSEARCH_DETAILED_META_FORMAT
## is cached, a bibreformat must be run for the cached records. Note
## that enabling Open Graph produces invalid XHTML/HTML5 markup.
CFG_WEBSEARCH_ENABLE_OPENGRAPH = False

## CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD -- switches to filtering a
## presorted self-citations list when the number of recids is bigger
## than this.
CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD = 20000

## CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH -- decides whether search for
## collection names is disabled (-1), enabled only for the home
## collection (0), enabled (1), or enabled for all collections
## including those not attached to the collection tree (2). This
## requires the CollectionNameSearchService search service to be
## enabled.
CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH = 0

#######################################
## Part 4: BibHarvest OAI parameters ##
#######################################

## This part defines parameters for the Invenio OAI gateway.
## Useful if you are running Invenio as an OAI data provider.
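## Once configured, the gateway can be sanity-checked with standard
## OAI-PMH requests against the /oai2d endpoint, e.g. (hostname is
## illustrative):
##   http://your.site.com/oai2d?verb=Identify
##   http://your.site.com/oai2d?verb=ListRecords&metadataPrefix=oai_dc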
## CFG_OAI_ID_FIELD -- OAI identifier MARC field:
CFG_OAI_ID_FIELD = 909COo

## CFG_OAI_SET_FIELD -- OAI set MARC field:
CFG_OAI_SET_FIELD = 909COp

## CFG_OAI_PREVIOUS_SET_FIELD -- previous OAI set MARC field:
CFG_OAI_PREVIOUS_SET_FIELD = 909COq

## CFG_OAI_DELETED_POLICY -- OAI deletedrecordspolicy
## (no/transient/persistent):
CFG_OAI_DELETED_POLICY = persistent

## CFG_OAI_ID_PREFIX -- OAI identifier prefix:
CFG_OAI_ID_PREFIX = atlantis.cern.ch

## CFG_OAI_SAMPLE_IDENTIFIER -- OAI sample identifier:
CFG_OAI_SAMPLE_IDENTIFIER = oai:atlantis.cern.ch:123

## CFG_OAI_IDENTIFY_DESCRIPTION -- description for the OAI Identify verb:
CFG_OAI_IDENTIFY_DESCRIPTION = <description>
 <eprints xmlns="http://www.openarchives.org/OAI/1.1/eprints"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://www.openarchives.org/OAI/1.1/eprints
                              http://www.openarchives.org/OAI/1.1/eprints.xsd">
  <content>
   <URL>%(CFG_SITE_URL)s</URL>
  </content>
  <metadataPolicy>
   <text>Free and unlimited use by anybody with obligation to refer to original record</text>
  </metadataPolicy>
  <dataPolicy>
   <text>Full content, i.e. preprints may not be harvested by robots</text>
  </dataPolicy>
  <submissionPolicy>
   <text>Submission restricted. Submitted documents are subject of approval by OAI repository admins.</text>
  </submissionPolicy>
 </eprints>
</description>

## CFG_OAI_LOAD -- OAI number of records in a response:
CFG_OAI_LOAD = 500

## CFG_OAI_EXPIRE -- OAI resumptionToken expiration time:
CFG_OAI_EXPIRE = 90000

## CFG_OAI_SLEEP -- service unavailable between two consecutive
## requests for CFG_OAI_SLEEP seconds:
CFG_OAI_SLEEP = 2

## CFG_OAI_METADATA_FORMATS -- mapping between accepted metadataPrefixes and
## the corresponding output format to use, its schema and its metadataNamespace.
CFG_OAI_METADATA_FORMATS = {
    'marcxml': ('XOAIMARC',
                'http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd',
                'http://www.loc.gov/MARC21/slim'),
    'oai_dc': ('XOAIDC',
               'http://www.openarchives.org/OAI/1.1/dc.xsd',
               'http://purl.org/dc/elements/1.1/'),
    }

## CFG_OAI_FRIENDS -- list of OAI baseURLs of friend repositories. See:
## <http://www.openarchives.org/OAI/2.0/guidelines-friends.htm>
CFG_OAI_FRIENDS = http://cds.cern.ch/oai2d,http://openaire.cern.ch/oai2d,http://export.arxiv.org/oai2

## The following subfields are a complement to
## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG. If CFG_OAI_PROVENANCE_BASEURL_SUBFIELD is
## set for a record, then the corresponding field is considered as having been
## harvested via OAI-PMH.

## CFG_OAI_PROVENANCE_BASEURL_SUBFIELD -- baseURL of the originDescription
## of a record
CFG_OAI_PROVENANCE_BASEURL_SUBFIELD = u

## CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD -- datestamp of the originDescription
## of a record
CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD = d

## CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD -- metadataNamespace of the
## originDescription of a record
CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD = m

## CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD -- originDescription of the
## originDescription of a record
CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD = d

## CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD -- harvestDate of the
## originDescription of a record
CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD = h

## CFG_OAI_PROVENANCE_ALTERED_SUBFIELD -- altered flag of the
## originDescription of a record
CFG_OAI_PROVENANCE_ALTERED_SUBFIELD = t

## CFG_OAI_FAILED_HARVESTING_STOP_QUEUE -- when harvesting OAI sources
## fails, shall we report an error with the task and stop the BibSched
## queue, or simply wait for the next run of the task? A value of 0
## will stop the task upon errors, 1 will let the queue run if the
## next run of the oaiharvest task can safely recover the failure
## (this means that the queue will stop if the task is not set to run
## periodically).
CFG_OAI_FAILED_HARVESTING_STOP_QUEUE = 1

## CFG_OAI_FAILED_HARVESTING_EMAILS_ADMIN -- when
## CFG_OAI_FAILED_HARVESTING_STOP_QUEUE is set to leave the queue
## running upon errors, shall we send an email to the admin to notify
## about the failure?
CFG_OAI_FAILED_HARVESTING_EMAILS_ADMIN = True

## NOTE: the following parameters are experimental
## -----------------------------------------------------------------------------

## CFG_OAI_RIGHTS_FIELD -- MARC field dedicated to storing Copyright information
CFG_OAI_RIGHTS_FIELD = 542__

## CFG_OAI_RIGHTS_HOLDER_SUBFIELD -- MARC subfield dedicated to storing the
## Copyright holder information
CFG_OAI_RIGHTS_HOLDER_SUBFIELD = d

## CFG_OAI_RIGHTS_DATE_SUBFIELD -- MARC subfield dedicated to storing the
## Copyright date information
CFG_OAI_RIGHTS_DATE_SUBFIELD = g

## CFG_OAI_RIGHTS_URI_SUBFIELD -- MARC subfield dedicated to storing the URI
## (URL or URN, more detailed statement about copyright status) information
CFG_OAI_RIGHTS_URI_SUBFIELD = u

## CFG_OAI_RIGHTS_CONTACT_SUBFIELD -- MARC subfield dedicated to storing the
## Copyright holder contact information
CFG_OAI_RIGHTS_CONTACT_SUBFIELD = e

## CFG_OAI_RIGHTS_STATEMENT_SUBFIELD -- MARC subfield dedicated to storing the
## Copyright statement as presented on the resource
CFG_OAI_RIGHTS_STATEMENT_SUBFIELD = f

## CFG_OAI_LICENSE_FIELD -- MARC field dedicated to storing terms governing
## use and reproduction (license)
CFG_OAI_LICENSE_FIELD = 540__

## CFG_OAI_LICENSE_TERMS_SUBFIELD -- MARC subfield dedicated to storing the
## terms governing use and reproduction, e.g. a CC license
CFG_OAI_LICENSE_TERMS_SUBFIELD = a

## CFG_OAI_LICENSE_PUBLISHER_SUBFIELD -- MARC subfield dedicated to storing the
## person or institute imposing the license (author, publisher)
CFG_OAI_LICENSE_PUBLISHER_SUBFIELD = b

## CFG_OAI_LICENSE_URI_SUBFIELD -- MARC subfield dedicated to storing the
## URI of the license
CFG_OAI_LICENSE_URI_SUBFIELD = u
##------------------------------------------------------------------------------

###################################
## Part 5: BibDocFile parameters ##
###################################

## This section contains some configuration parameters for the BibDocFile
## module.

## CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_DOCTYPES -- this is the list of
## doctypes (like 'Main' or 'Additional') and their descriptions that admins
## can choose from when adding new files via the Document File Manager
## admin interface.
##   - When no value is provided, admins cannot add new
##     files (they can only revise/delete/add formats)
##   - When a single value is given, it is used as the
##     default doctype for all new documents
##
## Order is relevant.
## Eg:
## [('main', 'Main document'), ('additional', 'Figure, schema, etc.')]
CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_DOCTYPES = [
    ('Main', 'Main document'),
    ('LaTeX', 'LaTeX'),
    ('Source', 'Source'),
    ('Additional', 'Additional File'),
    ('Audio', 'Audio file'),
    ('Video', 'Video file'),
    ('Script', 'Script'),
    ('Data', 'Data'),
    ('Figure', 'Figure'),
    ('Schema', 'Schema'),
    ('Graph', 'Graph'),
    ('Image', 'Image'),
    ('Drawing', 'Drawing'),
    ('Slides', 'Slides')]

## CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_RESTRICTIONS -- this is the
## list of restrictions (like 'Restricted' or 'No Restriction') and their
## descriptions that admins can choose from when adding or revising files.
## Restrictions can then be configured at the level of WebAccess.
##   - When no value is provided, no restriction is
##     applied
##   - When a single value is given, it is used as the
##     default restriction for all documents.
##   - The first value of the list is used as the default
##     restriction if the user is not given the
##     choice of the restriction.
## Order is relevant.
##
## Eg:
## [('', 'No restriction'), ('restr', 'Restricted')]
CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_RESTRICTIONS = [
    ('', 'Public'),
    ('restricted', 'Restricted')]

## CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_MISC -- set here the other
## default flags and attributes to tune the Document File Manager admin
## interface.
## See the docstring of bibdocfile_managedocfiles.create_file_upload_interface
## for a description of the available parameters and their syntax.
## In general you will rarely need to change this variable.
CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_MISC = {
    'can_revise_doctypes': ['*'],
    'can_comment_doctypes': ['*'],
    'can_describe_doctypes': ['*'],
    'can_delete_doctypes': ['*'],
    'can_keep_doctypes': ['*'],
    'can_rename_doctypes': ['*'],
    'can_add_format_to_doctypes': ['*'],
    'can_restrict_doctypes': ['*'],
    }

## CFG_BIBDOCFILE_FILESYSTEM_BIBDOC_GROUP_LIMIT -- the fulltext
## documents are stored under "/opt/invenio/var/data/files/gX/Y"
## directories where X is 0,1,... and Y stands for the bibdoc ID. Thus
## documents Y are grouped into directories X and this variable
## indicates the maximum number of documents Y stored in each
## directory X. This limit is imposed solely for filesystem
## performance reasons in order not to have too many subdirectories in
## a given directory.
CFG_BIBDOCFILE_FILESYSTEM_BIBDOC_GROUP_LIMIT = 5000

## CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS -- a comma-separated
## list of document extensions not listed in the Python standard mimetype
## library that should be recognized by Invenio.
CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS = hpg,link,lis,llb,mat,mpp,msg,docx,docm,xlsx,xlsm,xlsb,pptx,pptm,ppsx,ppsm

## CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES -- a mapping of additional
## mimetypes that could be served or have to be recognized by this instance
## of Invenio (this is useful in order to patch old versions of the
## mimetypes Python module).
CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES = {
    "application/xml-dtd": ".dtd",
    }

## CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING -- the mimetypes Python
## module provides a mapping between mimetypes and file extensions.
## Unfortunately this is a one-to-many mapping and the system
## has to pick one extension for a given mimetype.
## This parameter lets you specify the actual desired
## mapping that is going to override the mimetypes one.
CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING = {
    'application/msword': '.doc',
    'application/octet-stream': '.bin',
    'application/postscript': '.ps',
    'application/vnd.ms-excel': '.xls',
    'application/vnd.ms-powerpoint': '.ppt',
    'application/x-gtar-compressed': '.tgz',
    'application/xhtml+xml': '.xhtml',
    'application/xml': '.xml',
    'audio/mpeg': '.mp3',
    'audio/ogg': '.ogg',
    'image/jpeg': '.jpeg',
    'image/svg+xml': '.svg',
    'image/tiff': '.tiff',
    'message/rfc822': '.eml',
    'text/calendar': '.ics',
    'text/plain': '.txt',
    'video/mpeg': '.mpeg',
    }

## CFG_BIBDOCFILE_DESIRED_CONVERSIONS -- a dictionary having as keys
## a format and as values the corresponding list of desired converted
## formats.
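## (In the values below, a target such as 'pdf;pdfa' is read following
## BibDocFile's format;subformat notation, i.e. a PDF flagged with the
## 'pdfa' (PDF/A) subformat.)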
CFG_BIBDOCFILE_DESIRED_CONVERSIONS = {
    'pdf' : ('pdf;pdfa', ),
    'ps.gz' : ('pdf;pdfa', ),
    'djvu' : ('pdf', ),
    'sxw' : ('doc', 'odt', 'pdf;pdfa', ),
    'docx' : ('doc', 'odt', 'pdf;pdfa', ),
    'doc' : ('odt', 'pdf;pdfa', 'docx'),
    'rtf' : ('pdf;pdfa', 'odt', ),
    'odt' : ('pdf;pdfa', 'doc', ),
    'pptx' : ('ppt', 'odp', 'pdf;pdfa', ),
    'ppt' : ('odp', 'pdf;pdfa', 'pptx'),
    'sxi' : ('odp', 'pdf;pdfa', ),
    'odp' : ('pdf;pdfa', 'ppt', ),
    'xlsx' : ('xls', 'ods', 'csv'),
    'xls' : ('ods', 'csv'),
    'ods' : ('xls', 'xlsx', 'csv'),
    'sxc' : ('xls', 'xlsx', 'csv'),
    'tiff' : ('pdf;pdfa', ),
    'tif' : ('pdf;pdfa', ),
    }

## CFG_BIBDOCFILE_USE_XSENDFILE -- if your web server supports the
## XSendfile header, you may want to enable this feature in order for
## Invenio to tell the web server to stream files for download (after
## proper authorization checks) by the web server's means. This helps
## to liberate Invenio worker processes from being busy with sending
## big files to clients. The web server will take care of that. Note:
## this feature is still somewhat experimental. Note: when enabled
## (set to 1), then you have to also regenerate the Apache vhost conf
## snippets (inveniocfg --update-config-py --create-apache-conf).
CFG_BIBDOCFILE_USE_XSENDFILE = 0

## CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY -- a number between 0 and
## 1 that indicates the probability with which the MD5 checksum will be
## verified when streaming bibdocfile-managed files. (0.1 will cause
## the check to be performed once for every 10 downloads)
CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY = 0.1

## CFG_BIBDOCFILE_BEST_FORMATS_TO_EXTRACT_TEXT_FROM -- a comma-separated
## list of document extensions in descending order of preference
## to suggest what is considered the best format to extract text from.
CFG_BIBDOCFILE_BEST_FORMATS_TO_EXTRACT_TEXT_FROM = ('txt', 'html', 'xml', 'odt', 'doc', 'docx', 'djvu', 'pdf', 'ps', 'ps.gz')

## CFG_BIBDOCFILE_ENABLE_BIBDOCFSINFO_CACHE -- whether to use the
## database table bibdocfsinfo as reference for filesystem
## information. The default is 0. Switch this to 1
## after you have run bibdocfile --fix-bibdocfsinfo-cache
## or on an empty system.
CFG_BIBDOCFILE_ENABLE_BIBDOCFSINFO_CACHE = 0

## CFG_BIBDOCFILE_AFS_VOLUME_PATTERN -- if documents are going to be stored
## on the AFS filesystem (e.g. in the case of the CDS and Inspire projects),
## this is the pattern to be used to create the volumes to be mounted when
## allocating new gXX directories (e.g. p.cds.%s or p.inspire.%s).
## Note: this parameter is considered only if either CFG_CERN_SITE or
## CFG_INSPIRE_SITE is set to 1.
## See: CERN-specific afs_admin man page for more information.
CFG_BIBDOCFILE_AFS_VOLUME_PATTERN = p.invenio.%s

## CFG_BIBDOCFILE_AFS_VOLUME_QUOTA -- if documents are going to be stored on
## the AFS filesystem (see CFG_BIBDOCFILE_AFS_VOLUME_PATTERN), this is the
## initial quota automatically allocated when creating new volumes for gXX
## directories.
## Note: this parameter is considered only if either CFG_CERN_SITE or
## CFG_INSPIRE_SITE is set to 1.
## See: CERN-specific afs_admin man page for more information.
CFG_BIBDOCFILE_AFS_VOLUME_QUOTA = 10000000

## CFG_OPENOFFICE_SERVER_HOST -- the host on which an OpenOffice Server
## is listening. If localhost, an OpenOffice server will be started
## automatically if it is not already running.
## Note: if you set this to an empty value this will disable the usage of
## OpenOffice for converting documents.
## If you set this to something different from localhost you'll have to take
## care to have an OpenOffice server running on the corresponding host and
## to install the same OpenOffice release both on the client and on the server
## side.
## In order to launch an OpenOffice server on a remote machine, just start
## the usual 'soffice' executable in this way:
## $> soffice -headless -nologo -nodefault -norestore -nofirststartwizard \
## .. -accept=socket,host=HOST,port=PORT;urp;StarOffice.ComponentContext
CFG_OPENOFFICE_SERVER_HOST = localhost

## CFG_OPENOFFICE_SERVER_PORT -- the port on which an OpenOffice Server is
## listening.
CFG_OPENOFFICE_SERVER_PORT = 2002

## CFG_OPENOFFICE_USER -- the user that will be used to launch the OpenOffice
## client. It is recommended to set this to a user who doesn't own files, like
## e.g. 'nobody'. You should also authorize your Apache server user to be
## able to become this user, e.g. by adding to your /etc/sudoers the following
## line:
## "apache ALL=(nobody) NOPASSWD: ALL"
## provided that apache is the username corresponding to the Apache user.
## On some machines this might be apache2 or www-data.
CFG_OPENOFFICE_USER = nobody

#################################
## Part 6: BibIndex parameters ##
#################################

## This section contains some configuration parameters for the BibIndex
## module. Please note that BibIndex is mostly configured on run-time
## via its BibIndex Admin web interface. The parameters below are the
## ones that you probably do not want to modify very often during the
## runtime.

## CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY -- when fulltext indexing, do
## you want to index locally stored files only, or also external URLs?
## Use "0" to say "no" and "1" to say "yes".
CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY = 1

## (deprecated) CFG_BIBINDEX_REMOVE_STOPWORDS -- configuration moved to
## DB, variable kept here just for backwards compatibility purposes.
CFG_BIBINDEX_REMOVE_STOPWORDS =

## CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS -- characters considered as
## alphanumeric separators of word-blocks inside words. You probably
## don't want to change this.
CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS = \!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~

## CFG_BIBINDEX_CHARS_PUNCTUATION -- characters considered as punctuation
## between word-blocks inside words. You probably don't want to
## change this.
CFG_BIBINDEX_CHARS_PUNCTUATION = \.\,\:\;\?\!\"

## (deprecated) CFG_BIBINDEX_REMOVE_HTML_MARKUP -- now in database
CFG_BIBINDEX_REMOVE_HTML_MARKUP = 0

## (deprecated) CFG_BIBINDEX_REMOVE_LATEX_MARKUP -- now in database
CFG_BIBINDEX_REMOVE_LATEX_MARKUP = 0

## CFG_BIBINDEX_MIN_WORD_LENGTH -- minimum word length allowed to be added
## to the index. Terms shorter than this will be discarded. Useful to
## keep the database clean; however, you can safely leave this value at 0
## for up to 1,000,000 documents.
CFG_BIBINDEX_MIN_WORD_LENGTH = 0

## CFG_BIBINDEX_URLOPENER_USERNAME and CFG_BIBINDEX_URLOPENER_PASSWORD --
## access credentials to access restricted URLs, interesting only if
## you are fulltext-indexing files located on a remote server that is
## only available via username/password. But it's probably better to
## handle this case via IP or some convention; the current scheme is
## mostly there for demo only.
CFG_BIBINDEX_URLOPENER_USERNAME = mysuperuser
CFG_BIBINDEX_URLOPENER_PASSWORD = mysuperpass

## CFG_INTBITSET_ENABLE_SANITY_CHECKS --
## enable sanity checks for integers passed to the intbitset data
## structures. It is good to enable this during debugging
## and to disable it for speed improvements.
CFG_INTBITSET_ENABLE_SANITY_CHECKS = False

## CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES -- regular expression that matches
## docnames for which OCR is desired (set this to .* in order to enable
## OCR in general, set this to empty in order to disable it.)
CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES = scan-.*

## CFG_BIBINDEX_SPLASH_PAGES -- key-value mapping where the key corresponds
## to a regular expression that matches the URLs of the splash pages of
## a given service and the value is a regular expression of the set of URLs
## referenced via tags in the HTML content of the splash pages that are
## referring to documents that need to be indexed.
## NOTE: for backward compatibility reasons you can set this to a simple
## regular expression that will directly be used as the unique key of the
## map, with corresponding value set to ".*" (in order to match any URL)
CFG_BIBINDEX_SPLASH_PAGES = {
    "http://documents\.cern\.ch/setlink\?.*": ".*",
    "http://ilcagenda\.linearcollider\.org/subContributionDisplay\.py\?.*|http://ilcagenda\.linearcollider\.org/contributionDisplay\.py\?.*": "http://ilcagenda\.linearcollider\.org/getFile\.py/access\?.*|http://ilcagenda\.linearcollider\.org/materialDisplay\.py\?.*",
    }

## CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES -- do we want
## the author word index to exclude first names to keep only last
## names? If set to True, then for the author `Bernard, Denis', only
## `Bernard' will be indexed in the word index, not `Denis'. Note
## that if you change this variable, you have to re-index the author
## index via `bibindex -w author -R'.
CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES = False

## (deprecated) CFG_BIBINDEX_SYNONYM_KBRS -- configuration moved to
## DB, variable kept here just for backwards compatibility purposes.
CFG_BIBINDEX_SYNONYM_KBRS = {}

#######################################
## Part 7: Access control parameters ##
#######################################

## This section contains some configuration parameters for the access
## control system. Please note that WebAccess is mostly configured on
## run-time via its WebAccess Admin web interface. The parameters
## below are the ones that you probably do not want to modify very
## often during the runtime. (If you do want to modify them during
## runtime, for example to deny access temporarily because of backups,
## you can edit access_control_config.py directly, no need to get back
## here and no need to redo the make process.)

## CFG_ACCESS_CONTROL_LEVEL_SITE -- defines how open this site is.
## Use 0 for normal operation of the site, 1 for read-only site (all
## write operations temporarily closed), 2 for site fully closed,
## 3 for also disabling any database connection.
## Useful for site maintenance.
CFG_ACCESS_CONTROL_LEVEL_SITE = 0

## CFG_ACCESS_CONTROL_LEVEL_GUESTS -- guest users access policy. Use
## 0 to allow guest users, 1 not to allow them (all users must login).
CFG_ACCESS_CONTROL_LEVEL_GUESTS = 0

## CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS -- account registration and
## activation policy. When 0, users can register and accounts are
## automatically activated. When 1, users can register but admin must
## activate the accounts.
## When 2, users cannot register nor update their email address, only
## admin can register accounts. When 3, users cannot register nor
## update email address nor password, only admin can register
## accounts. When 4, the same as 3 applies, plus the user cannot
## change his login method. When 5, the same as 4 applies, plus info
## about how to get an account is hidden from the login page.
CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS = 0

## CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN -- limit account
## registration to certain email addresses? If wanted, give the domain
## name below, e.g. "cern.ch". If not wanted, leave it empty.
CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN =

## CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS -- send a
## notification email to the administrator when a new account is
## created? Use 0 for no, 1 for yes.
CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS = 0

## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT -- send a
## notification email to the user when a new account is created, in
## order to verify the validity of the provided email address? Use
## 0 for no, 1 for yes.
CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT = 1

## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_ACTIVATION -- send a
## notification email to the user when a new account is activated?
## Use 0 for no, 1 for yes.
CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_ACTIVATION = 0

## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_DELETION -- send a
## notification email to the user when a new account is deleted or
## an account demand is rejected? Use 0 for no, 1 for yes.
CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_DELETION = 0

## CFG_APACHE_PASSWORD_FILE -- the file where Apache user credentials
## are stored. Must be an absolute pathname. If the value does not
## start with a slash, it is considered to be the filename of a file
## located under the prefix/var/tmp directory. This is useful for the
## demo site testing purposes. For the production site, if you plan
## to restrict access to some collections based on the Apache user
## authentication mechanism, you should put here an absolute path to
## your Apache password file.
CFG_APACHE_PASSWORD_FILE = demo-site-apache-user-passwords

## CFG_APACHE_GROUP_FILE -- the file where Apache user groups are
## defined. See the documentation of the preceding config variable.
CFG_APACHE_GROUP_FILE = demo-site-apache-user-groups

###################################
## Part 8: WebSession parameters ##
###################################

## This section contains some configuration parameters for tweaking
## session handling.

## CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT -- number of days after which a session
## and the corresponding cookie is considered expired.
CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT = 2

## CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER -- number of days after which a session
## and the corresponding cookie is considered expired, when the user has
## requested to permanently stay logged in.
CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER = 365

## CFG_WEBSESSION_RESET_PASSWORD_EXPIRE_IN_DAYS -- when a user has requested
## a password reset, for how many days is the URL valid?
CFG_WEBSESSION_RESET_PASSWORD_EXPIRE_IN_DAYS = 3

## CFG_WEBSESSION_ADDRESS_ACTIVATION_EXPIRE_IN_DAYS -- when an account
## activation email was sent, for how many days is the URL valid?
CFG_WEBSESSION_ADDRESS_ACTIVATION_EXPIRE_IN_DAYS = 3

## CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS -- when a
## user does not confirm his email address and does not complete the
## registration, after how many days does it expire?
CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS = 10

## CFG_WEBSESSION_DIFFERENTIATE_BETWEEN_GUESTS -- when set to 0, the session
## system allocates the same uid=0 to all guest users regardless of where
## they come from; 1 allocates a unique uid to each guest.
CFG_WEBSESSION_DIFFERENTIATE_BETWEEN_GUESTS = 0

## CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS -- to prevent session cookie
## stealing, Invenio checks that the IP address of a connection is the
## same as that of the connection which created the initial session.
## This variable lets you decide how many bits should be skipped during
## this check. Set this to 0 in order to enable full IP address
## checking. Set this to 32 in order to disable IP address checking.
## Intermediate values (say 8) let you have some degree of security so
## that you can trust your local network only, while helping to solve
## issues related to outside clients that configured their browser to
## use a web proxy for HTTP connections but not for HTTPS, thus
## potentially having two different IP addresses. In general, if you
## use HTTPS in order to serve authenticated content, you can safely
## set CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS to 32.
CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS = 0

################################
## Part 9: BibRank parameters ##
################################

## This section contains some configuration parameters for the ranking
## system.

## CFG_BIBRANK_SHOW_READING_STATS -- do we want to show reading
## similarity stats? ('People who viewed this page also viewed')
CFG_BIBRANK_SHOW_READING_STATS = 1

## CFG_BIBRANK_SHOW_DOWNLOAD_STATS -- do we want to show the download
## similarity stats? ('People who downloaded this document also
## downloaded')
CFG_BIBRANK_SHOW_DOWNLOAD_STATS = 1

## CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS -- do we want to show the download
## history graph? (0=no | 1=classic/gnuplot | 2=flot)
CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS = 1

## CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION -- do we
## want to show a graph representing the distribution of client IPs
## downloading a given document? (0=no | 1=classic/gnuplot | 2=flot)
CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION = 0

## CFG_BIBRANK_SHOW_CITATION_LINKS -- do we want to show the 'Cited
## by' links? (useful only when you have citations in the metadata)
CFG_BIBRANK_SHOW_CITATION_LINKS = 1

## CFG_BIBRANK_SHOW_CITATION_STATS -- do we want to show citation
## stats? ('Cited by M records', 'Co-cited with N records')
CFG_BIBRANK_SHOW_CITATION_STATS = 1

## CFG_BIBRANK_SHOW_CITATION_GRAPHS -- do we want to show the citation
## history graph? (0=no | 1=classic/gnuplot | 2=flot)
CFG_BIBRANK_SHOW_CITATION_GRAPHS = 1

## CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID -- use authorids for computing
## self-citations; falls back to hashing the author string.
CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID = 0

## CFG_BIBRANK_SELFCITES_PRECOMPUTE -- use precomputed self-citations
## when displaying the citesummary. Precomputing citations allows us
## to speed things up.
CFG_BIBRANK_SELFCITES_PRECOMPUTE = 0

####################################
## Part 10: WebComment parameters ##
####################################

## This section contains some configuration parameters for the
## commenting and reviewing facilities.

## CFG_WEBCOMMENT_ALLOW_COMMENTS -- do we want to allow users to write
## public comments on records?
CFG_WEBCOMMENT_ALLOW_COMMENTS = 1

## CFG_WEBCOMMENT_ALLOW_REVIEWS -- do we want to allow users to write
## public reviews of records?
CFG_WEBCOMMENT_ALLOW_REVIEWS = 1 ## CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS -- do we want to allow short ## reviews, that is just the attribution of stars without submitting ## detailed review text? CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS = 0 ## CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN -- if users ## report a comment to be abusive, how many reports does it take before the ## site admin is alerted? CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN = 5 ## CFG_WEBCOMMENT_NB_COMMENTS_IN_DETAILED_VIEW -- how many comments do ## we display in the detailed record page upon welcome? CFG_WEBCOMMENT_NB_COMMENTS_IN_DETAILED_VIEW = 1 ## CFG_WEBCOMMENT_NB_REVIEWS_IN_DETAILED_VIEW -- how many reviews do ## we display in the detailed record page upon welcome? CFG_WEBCOMMENT_NB_REVIEWS_IN_DETAILED_VIEW = 1 ## CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL -- do we notify the site ## admin after every comment? CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL = 1 ## CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS -- how many ## elapsed seconds do we consider enough when checking for possible ## multiple comment submissions by a user? CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS = 20 ## CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_REVIEWS_IN_SECONDS -- how many ## elapsed seconds do we consider enough when checking for possible ## multiple review submissions by a user? CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_REVIEWS_IN_SECONDS = 20 ## CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR -- enable the WYSIWYG ## Javascript-based editor when a user edits comments? CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR = False ## CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL -- the email address from which the ## alert emails will appear to be sent: CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL = info@invenio-software.org ## CFG_WEBCOMMENT_DEFAULT_MODERATOR -- if no rules are ## specified to indicate who is the comment moderator of ## a collection, this person will be used as the default CFG_WEBCOMMENT_DEFAULT_MODERATOR = info@invenio-software.org ## CFG_WEBCOMMENT_USE_MATHJAX_IN_COMMENTS -- do we want to allow the use ## of the MathJax plugin to render LaTeX input in comments? CFG_WEBCOMMENT_USE_MATHJAX_IN_COMMENTS = 1 ## CFG_WEBCOMMENT_AUTHOR_DELETE_COMMENT_OPTION -- allow a comment's author to ## delete his own comment? CFG_WEBCOMMENT_AUTHOR_DELETE_COMMENT_OPTION = 1 # CFG_WEBCOMMENT_EMAIL_REPLIES_TO -- which field of the record defines # the email addresses that should be notified of newly submitted comments, # and for which collection. Use collection names as keys, and a list of # tags as values CFG_WEBCOMMENT_EMAIL_REPLIES_TO = { 'Articles': ['506__d', '506__m'], } # CFG_WEBCOMMENT_RESTRICTION_DATAFIELD -- which field of the record # defines the restriction (must be linked to WebAccess # 'viewrestrcomment') to apply to newly submitted comments, and for # which collection. Use collection names as keys, and one tag as value CFG_WEBCOMMENT_RESTRICTION_DATAFIELD = { 'Articles': '5061_a', 'Pictures': '5061_a', 'Theses': '5061_a', } # CFG_WEBCOMMENT_ROUND_DATAFIELD -- which field of the record defines # the current round of comments, and for which collection. Use collection # name as key, and one tag as value CFG_WEBCOMMENT_ROUND_DATAFIELD = { 'Articles': '562__c', 'Pictures': '562__c', } # CFG_WEBCOMMENT_MAX_ATTACHMENT_SIZE -- max file size per attached # file, in bytes. Choose 0 if you don't want to limit the size CFG_WEBCOMMENT_MAX_ATTACHMENT_SIZE = 5242880 # CFG_WEBCOMMENT_MAX_ATTACHED_FILES -- maximum number of files that can # be attached per comment.
Choose 0 if you don't want to limit the # number of files. File uploads can be restricted with action # "attachcommentfile". CFG_WEBCOMMENT_MAX_ATTACHED_FILES = 5 # CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH -- how many levels of # indentation a discussion can have. This can be used to ensure that # discussions will not go into deep levels of nesting if users don't # understand the difference between "reply to comment" and "add # comment". When the depth is reached, any "reply to comment" is # conceptually converted to a "reply to thread" (i.e. a reply to this # parent's comment). Use -1 for no limit, 0 for unthreaded (flat) # discussions. (For example, with the default depth of 1, a reply to an # already-indented comment is attached to that comment's parent rather # than nesting further.) CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH = 1 ################################## ## Part 11: BibSched parameters ## ################################## ## This section contains some configuration parameters for the ## bibliographic task scheduler. ## CFG_BIBSCHED_REFRESHTIME -- how often do we want to refresh the ## bibsched monitor? (in seconds) CFG_BIBSCHED_REFRESHTIME = 5 ## CFG_BIBSCHED_LOG_PAGER -- what pager to use to view bibsched task ## logs? CFG_BIBSCHED_LOG_PAGER = /usr/bin/less ## CFG_BIBSCHED_EDITOR -- what editor to use to edit the marcxml ## code of the locked records CFG_BIBSCHED_EDITOR = /usr/bin/vim ## CFG_BIBSCHED_GC_TASKS_OLDER_THAN -- after how many days to perform ## garbage collection of the BibSched queue (i.e. removing tasks or moving them to the archive). CFG_BIBSCHED_GC_TASKS_OLDER_THAN = 30 ## CFG_BIBSCHED_GC_TASKS_TO_REMOVE -- list of BibTasks that can be safely ## removed from the BibSched queue once they are DONE. CFG_BIBSCHED_GC_TASKS_TO_REMOVE = bibindex,bibreformat,webcoll,bibrank,inveniogc ## CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE -- list of BibTasks that should be safely ## archived out of the BibSched queue once they are DONE. CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE = bibupload,oairepositoryupdater ## CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS -- maximum number of BibTasks ## that can run concurrently. ## NOTE: concurrent tasks are still considered an experimental ## feature. Please keep this value set to 1 in production environments. CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS = 1 ## CFG_BIBSCHED_PROCESS_USER -- bibsched and bibtask processes must ## usually run under the same identity as the Apache web server ## process in order to share proper file read/write privileges. If ## you want to force some other bibsched/bibtask user, e.g. because ## you are using a local `invenio' user that belongs to your ## `www-data' Apache user group and so shares writing rights with your ## Apache web server process in this way, then please set its username ## identity here. Otherwise we shall check whether your ## bibsched/bibtask processes are run under the same identity as your ## Apache web server process (in which case you can leave the default ## empty value here). CFG_BIBSCHED_PROCESS_USER = ## CFG_BIBSCHED_NODE_TASKS -- specific nodes may be configured to ## run only specific tasks; if you want this, then this variable is a ## dictionary of the form {'hostname1': ['task1', 'task2']}. The ## default is that any node can run any task.
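## A hypothetical sketch (the hostnames below are placeholders, not shipped ## defaults) dedicating one node to uploads and another to indexing: ## CFG_BIBSCHED_NODE_TASKS = { ## 'worker1.example.com': ['bibindex', 'bibrank', 'webcoll'], ## 'worker2.example.com': ['bibupload'], ## }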
CFG_BIBSCHED_NODE_TASKS = {} ## CFG_BIBSCHED_MAX_ARCHIVED_ROWS_DISPLAY -- number of tasks displayed ## in the archive view (by pressing '1') CFG_BIBSCHED_MAX_ARCHIVED_ROWS_DISPLAY = 500 ## CFG_BIBSCHED_NON_CONCURRENT_TASKS -- tasks that should not be running at ## the same time (but one of them can be sleeping) CFG_BIBSCHED_NON_CONCURRENT_TASKS = ( ('bibupload', ),) ## CFG_BIBSCHED_INCOMPATIBLE_TASKS -- tasks that should not be running at ## the same time (even if the other one is sleeping) CFG_BIBSCHED_INCOMPATIBLE_TASKS = () ## CFG_BIBSCHED_LOGDIR -- location of bibtask logs such as ## bibsched_task_{ID}.(log|err). Add a trailing slash to support symlinks. ## If the path is relative, CFG_LOGDIR will be joined as a prefix. CFG_BIBSCHED_LOGDIR = bibsched ## CFG_BIBSCHED_FLUSH_LOGS -- flush logs after writing each message. ## This can be useful when using a filesystem that buffers your writes ## (like AFS) and you want to check the logs from a different server than the one ## the task is running on. CFG_BIBSCHED_FLUSH_LOGS = 0 ## CFG_BIBSCHED_NEVER_STOPS -- continue queue execution when a task fails CFG_BIBSCHED_NEVER_STOPS = 0 ################################### ## Part 12: WebBasket parameters ## ################################### ## CFG_WEBBASKET_MAX_NUMBER_OF_DISPLAYED_BASKETS -- a safety limit on ## the maximum number of displayed baskets CFG_WEBBASKET_MAX_NUMBER_OF_DISPLAYED_BASKETS = 20 ## CFG_WEBBASKET_USE_RICH_TEXT_EDITOR -- enable the WYSIWYG ## Javascript-based editor when a user edits comments in WebBasket? CFG_WEBBASKET_USE_RICH_TEXT_EDITOR = False ################################## ## Part 13: WebAlert parameters ## ################################## ## This section contains some configuration parameters for the ## automatic email notification alert system. ## CFG_WEBALERT_ALERT_ENGINE_EMAIL -- the email address from which the ## alert emails will appear to be sent: CFG_WEBALERT_ALERT_ENGINE_EMAIL = info@invenio-software.org ## CFG_WEBALERT_MAX_NUM_OF_RECORDS_IN_ALERT_EMAIL -- how many records ## at most do we send in an outgoing alert email? CFG_WEBALERT_MAX_NUM_OF_RECORDS_IN_ALERT_EMAIL = 20 ## CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL -- number of ## chars per line in an outgoing alert email? CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL = 72 ## CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES -- when sending alert ## emails fails, how many times do we retry? CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES = 3 ## CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES -- when sending ## alert emails fails, what is the sleeptime between tries? (in ## seconds) CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES = 300 #################################### ## Part 14: WebMessage parameters ## #################################### ## CFG_WEBMESSAGE_MAX_SIZE_OF_MESSAGE -- how large web messages do we ## allow? CFG_WEBMESSAGE_MAX_SIZE_OF_MESSAGE = 20000 ## CFG_WEBMESSAGE_MAX_NB_OF_MESSAGES -- how many messages do we allow in a ## regular user's inbox? CFG_WEBMESSAGE_MAX_NB_OF_MESSAGES = 30 ## CFG_WEBMESSAGE_DAYS_BEFORE_DELETE_ORPHANS -- how many days before ## we delete orphaned messages? CFG_WEBMESSAGE_DAYS_BEFORE_DELETE_ORPHANS = 60 ################################## ## Part 15: MiscUtil parameters ## ################################## ## CFG_MISCUTIL_SQL_USE_SQLALCHEMY -- whether to use SQLAlchemy.pool ## in the DB engine of Invenio. It is okay to enable this flag ## even if you have not installed SQLAlchemy. Note that Invenio will ## lose some performance if this option is enabled.
CFG_MISCUTIL_SQL_USE_SQLALCHEMY = False ## CFG_MISCUTIL_SQL_RUN_SQL_MANY_LIMIT -- how many queries can we run ## inside run_sql_many() in one SQL statement? The limit value ## depends on MySQL's max_allowed_packet configuration. CFG_MISCUTIL_SQL_RUN_SQL_MANY_LIMIT = 10000 ## CFG_MISCUTIL_SMTP_HOST -- which server to use as outgoing mail server to ## send outgoing emails generated by the system, for example concerning ## submissions or email notification alerts. CFG_MISCUTIL_SMTP_HOST = localhost ## CFG_MISCUTIL_SMTP_PORT -- which port to use on the outgoing mail server ## defined in the previous step. CFG_MISCUTIL_SMTP_PORT = 25 ## CFG_MISCUTIL_SMTP_USER -- which username to use on the outgoing mail server ## defined in CFG_MISCUTIL_SMTP_HOST. If either CFG_MISCUTIL_SMTP_USER or ## CFG_MISCUTIL_SMTP_PASS is empty, Invenio won't attempt authentication. CFG_MISCUTIL_SMTP_USER = ## CFG_MISCUTIL_SMTP_PASS -- which password to use on the outgoing mail ## server defined in CFG_MISCUTIL_SMTP_HOST. If either CFG_MISCUTIL_SMTP_USER ## or CFG_MISCUTIL_SMTP_PASS is empty, Invenio won't attempt authentication. CFG_MISCUTIL_SMTP_PASS = ## CFG_MISCUTIL_SMTP_TLS -- whether to use a TLS (secure) connection when ## talking to the SMTP server defined in CFG_MISCUTIL_SMTP_HOST. CFG_MISCUTIL_SMTP_TLS = False ## CFG_MISCUTIL_DEFAULT_PROCESS_TIMEOUT -- the default number of seconds after ## which a process launched through shellutils.run_process_with_timeout will ## be killed. This is useful to catch runaway processes. CFG_MISCUTIL_DEFAULT_PROCESS_TIMEOUT = 300 ## CFG_MATHJAX_HOSTING -- if you plan to use MathJax to display TeX ## formulas on HTML web pages, you can specify whether you wish to use ## 'local' hosting or 'cdn' hosting of MathJax libraries. (If set to ## 'local', you have to run 'make install-mathjax-plugin' as described ## in the INSTALL guide.) If set to 'local', users will use your site ## to download MathJax sources. If set to 'cdn', users will use ## centralized MathJax CDN servers instead. Please note that using ## the CDN is suitable only for small institutes or for MathJax ## sponsors; see the MathJax website for more details. (Also, please ## note that if you plan to use MathJax on your site, you have to ## adapt the CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS and ## CFG_WEBCOMMENT_USE_MATHJAX_IN_COMMENTS configuration variables ## elsewhere in this file.) CFG_MATHJAX_HOSTING = local ################################# ## Part 16: BibEdit parameters ## ################################# ## CFG_BIBEDIT_TIMEOUT -- when a user edits a record, this record is ## locked to prevent other users from editing it at the same time. ## How many seconds of inactivity before the locked record becomes free ## again for other people to edit? CFG_BIBEDIT_TIMEOUT = 3600 ## CFG_BIBEDIT_LOCKLEVEL -- when a user tries to edit a record for which there ## is a pending bibupload task in the queue, this shouldn't be permitted. ## The lock level determines how thoroughly the queue should be investigated ## to determine if this is the case.
## Level 0 - always permits editing, doesn't look at the queue ## (unsafe, use only if you know what you are doing) ## Level 1 - permits editing if there are no queued bibedit tasks for this record ## (safe with respect to bibedit, but not for other bibupload maintenance jobs) ## Level 2 - permits editing if there are no queued bibupload tasks of any sort ## (safe, but may lock more than necessary if many cataloguers are around) ## Level 3 - permits editing if no queued bibupload task concerns the given record ## (safe, most precise locking, but slow, ## checks for 001/EXTERNAL_SYSNO_TAG/EXTERNAL_OAIID_TAG) ## The recommended level is 3 (default) or 2 (if you use maintenance jobs often). CFG_BIBEDIT_LOCKLEVEL = 3 ## CFG_BIBEDIT_PROTECTED_FIELDS -- a comma-separated list of fields that BibEdit ## will not allow to be added, edited or deleted. Wildcards are not supported, ## but conceptually a wildcard is added at the end of every field specification. ## Examples: ## 500A - protect all MARC fields with tag 500 and first indicator A ## 5 - protect all MARC fields in the 500-series. ## 909C_a - protect subfield a in tag 909 with first indicator C and empty ## second indicator ## Note that 001 is protected by default, but if protection of other ## identifiers or automated fields is a requirement, they should be added to ## this list. CFG_BIBEDIT_PROTECTED_FIELDS = ## CFG_BIBEDIT_QUEUE_CHECK_METHOD -- how do we want to check for ## possible queue locking situations to prevent cataloguers from ## editing a record that may be waiting in the queue? Use 'bibrecord' ## for exact checking (always works, but may be slow), use 'regexp' ## for regular expression based checking (very fast, but may be ## inaccurate). When unsure, use 'bibrecord'. CFG_BIBEDIT_QUEUE_CHECK_METHOD = bibrecord ## CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE -- a list ## containing which collections will be extended with a given template ## while being displayed in the BibEdit UI. The collection corresponds to ## the value written in field 980. The order matters: the first match of ## a collection in CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE with a ## 980 field will be used. CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE = [('POETRY', 'record_poem')] ## CFG_BIBEDIT_KB_SUBJECTS - Name of the KB used in the field 65017a ## to automatically convert codes into their extended version, e.g. ## a - Astrophysics CFG_BIBEDIT_KB_SUBJECTS = Subjects ## CFG_BIBEDIT_KB_INSTITUTIONS - Name of the KB used for institution ## autocomplete. To be applied in the fields defined in ## CFG_BIBEDIT_AUTOCOMPLETE_INSTITUTIONS_FIELDS CFG_BIBEDIT_KB_INSTITUTIONS = InstitutionsCollection ## CFG_BIBEDIT_AUTOCOMPLETE_INSTITUTIONS_FIELDS - list of fields to ## be autocompleted with the KB CFG_BIBEDIT_KB_INSTITUTIONS CFG_BIBEDIT_AUTOCOMPLETE_INSTITUTIONS_FIELDS = 100__u,700__u,701__u,502__c ## CFG_BIBEDIT_SHOW_HOLDING_PEN_REMOVED_FIELDS -- whether or not to show ## the fields and subfields that are removed in holding pen records. ## 0 for false, 1 for true. CFG_BIBEDIT_SHOW_HOLDING_PEN_REMOVED_FIELDS = 0 ## CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING -- maximum number of records ## that can be modified instantly using the multi-record editor. Above ## this limit, modifications will only be executed in limited hours. CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING = 2000 ## CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING -- maximum number of records ## that can be sent for modification without having a superadmin role.
## If the number of records is between CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING ## and this number, the modifications will take place only in limited hours. CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING = 20000 ## CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING_TIME -- Allowed time to ## execute modifications on records, when the number exceeds ## CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING. CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING_TIME = 22:00-05:00 ## CFG_BIBEDIT_ADD_TICKET_RT_QUEUES -- list of queues that ## are displayed in the create-new-ticket select box, so ## that a user can create a new ticket in one of these queues. CFG_BIBEDIT_ADD_TICKET_RT_QUEUES = Authors,Conf_add+cor,Exp,Feedback,HEP_cor,HEP_ref,INST_add+cor,AUTHORS_long_list ################################### ## Part 17: BibUpload parameters ## ################################### ## CFG_BIBUPLOAD_REFERENCE_TAG -- where do we store references? CFG_BIBUPLOAD_REFERENCE_TAG = 999 ## CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG -- where do we store external ## system numbers? Useful for matching when our records come from an ## external digital library system. CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG = 970__a ## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG -- where do we store OAI ID tags ## of harvested records? Useful for matching when we harvest stuff ## via OAI that we do not want to reexport via Invenio OAI; so records ## may have only the source OAI ID stored in this tag (kind of like an ## external system number too). CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG = 035__a ## CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG -- where do we store OAI SRC ## tags of harvested records? Useful for matching when we harvest stuff ## via OAI that we do not want to reexport via Invenio OAI; so records ## may have only the source OAI SRC stored in this tag (kind of like an ## external system number too). Note that the field should be the same as ## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG. CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG = 035__9 ## CFG_BIBUPLOAD_STRONG_TAGS -- a comma-separated list of tags that ## are strong enough to resist the replace mode. Useful for tags that ## might be created from an external non-metadata-like source, ## e.g. the information about the number of copies left. CFG_BIBUPLOAD_STRONG_TAGS = 964 ## CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS -- a comma-separated list ## of tags that contain provenance information that should be checked ## in the bibupload correct mode via matching provenance codes. (Only ## field instances of the same provenance information would be acted ## upon.) Please specify the whole tag info up to subfield codes. CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS = 6531_9 ## CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS -- a comma-separated list of system ## paths from which it is allowed to take full-text files that will be uploaded via ## FFT (CFG_TMPDIR is included by default). CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS = /tmp,/home ## CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS -- a list of pairs describing ## external URLs that can be accessed by Invenio and the specific HTTP ## headers that will be used for each URL. The first element of each pair ## is a regular expression matching a set of URLs, the second is a ## dictionary of headers as consumed by urllib2.Request. If a ## regular expression matching all URLs is created at the end of the ## list, it means that Invenio will download all URLs. Otherwise ## Invenio will just download authorized URLs. Note: by default, a ## User-Agent built after the current Invenio version, site name, and ## site URL will be used.
The values of the header dictionary can ## also contain a call to a python function, in the form of a ## dictionary with two entries: the name of the function to be called ## as a value for the 'fnc' key, and the arguments to this function, ## as a value for the 'args' key (in the form of a dictionary). ## CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS = [ ## ('http://myurl.com/.*', {'User-Agent': 'Me'}), ## ('http://yoururl.com/.*', {'User-Agent': 'You', 'Accept': 'text/plain'}), ## ('http://thisurl.com/.*', {'Cookie': {'fnc':'read_cookie', 'args':{'cookiefile':'/tmp/cookies.txt'}}}), ## ('http://.*', {'User-Agent': 'Invenio'}), ## ] CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS = [ ('http(s)?://.*', {}), ] ## CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE -- do we want to serialize ## the internal representation of records (Pythonic record structure) into ## the database? This can improve the internal processing speed of some ## operations at the price of somewhat bigger disk space usage. ## If you change this value after some records have already been added ## to your installation, you may want to run: ## $ /opt/invenio/bin/inveniocfg --reset-recstruct-cache ## in order to either erase the cache, thus freeing database space, ## or to fill the cache for all records that have not been cached yet. CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE = 1 ## CFG_BIBUPLOAD_DELETE_FORMATS -- which formats do we want bibupload ## to delete when a record is ingested? Enter a comma-separated list of ## formats. For example, 'hb,hd' will delete the pre-formatted HTML brief ## and detailed formats from the cache, so that the search engine will ## generate them on the fly. Useful to always present the latest data of ## records upon record display, until the periodical bibreformat job ## runs next and updates the cache. CFG_BIBUPLOAD_DELETE_FORMATS = hb ## CFG_BIBUPLOAD_DISABLE_RECORD_REVISIONS -- set to 1 if keeping a ## history of record revisions is not necessary (e.g. because records ## and corresponding modifications always come from the same ## external system which already keeps a revision history). CFG_BIBUPLOAD_DISABLE_RECORD_REVISIONS = 0 ## CFG_BIBUPLOAD_CONFLICTING_REVISION_TICKET_QUEUE -- Set the name of ## the BibCatalog ticket queue to be used when BibUpload can't ## automatically resolve a revision conflict and therefore has to put ## the requested modifications in the holding pen. CFG_BIBUPLOAD_CONFLICTING_REVISION_TICKET_QUEUE = +## CFG_BIBUPLOAD_MATCH_DELETED_RECORDS -- Whether matching of +## records (e.g. via DOI and other identifiers) should also +## consider deleted records. +CFG_BIBUPLOAD_MATCH_DELETED_RECORDS = 1 + ## CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY -- a comma-separated list ## indicating which fields match the file names of the documents to be ## uploaded. ## The matching will be done in the same order as the list provided. CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY = reportnumber,recid ## CFG_BATCHUPLOADER_DAEMON_DIR -- Directory where the batchuploader daemon ## will look for the subfolders metadata and document by default.
## If the path is relative, CFG_PREFIX will be joined as a prefix CFG_BATCHUPLOADER_DAEMON_DIR = var/batchupload ## CFG_BATCHUPLOADER_WEB_ROBOT_AGENTS -- Regular expression specifying the ## agents permitted to call the batch uploader web interface ## cds.cern.ch/batchuploader/robotupload, ## e.g. when using curl: curl xxx -A invenio CFG_BATCHUPLOADER_WEB_ROBOT_AGENTS = invenio_webupload|Invenio-.* ## CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS -- Access list specifying, for each ## IP address (or range), which collections are allowed to use the batch uploader robot ## interface. ## Note: you can also specify network ranges such as "127.0.0.0/8". CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS = { '127.0.0.1': ['*'], # useful for testing '127.0.1.1': ['*'], # useful for testing '10.0.0.1': ['BOOK', 'REPORT'], # Example 1 '10.0.0.2': ['POETRY', 'PREPRINT'], # Example 2 } #################################### ## Part 18: BibCatalog parameters ## #################################### ## CFG_BIBCATALOG_SYSTEM -- set the desired catalog system. (RT or EMAIL) CFG_BIBCATALOG_SYSTEM = EMAIL ## Email backend configuration: CFG_BIBCATALOG_SYSTEM_EMAIL_ADDRESS = info@invenio-software.org ## RT backend configuration: ## CFG_BIBCATALOG_SYSTEM_RT_CLI -- path to the RT CLI client CFG_BIBCATALOG_SYSTEM_RT_CLI = /usr/bin/rt ## CFG_BIBCATALOG_SYSTEM_RT_URL -- Base URL of the remote RT system CFG_BIBCATALOG_SYSTEM_RT_URL = http://localhost/rt3 ## CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER -- Set the username for a default RT account ## on the remote system, with limited privileges, used only to create and modify its own tickets. CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER = ## CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD -- Set the password for the default RT account ## on the remote system. CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD = #################################### ## Part 19: BibFormat parameters ## #################################### ## CFG_BIBFORMAT_HIDDEN_TAGS -- comma-separated list of MARC tags that ## are not shown to users not having cataloging authorizations. CFG_BIBFORMAT_HIDDEN_TAGS = 595 ## CFG_BIBFORMAT_HIDDEN_FILE_FORMATS -- comma-separated list of file formats ## that are not shown explicitly to users not having cataloging authorizations. ## e.g. pdf;pdfa,xml CFG_BIBFORMAT_HIDDEN_FILE_FORMATS = ## CFG_BIBFORMAT_ADDTHIS_ID -- if you want to use the AddThis service, ## set this value to the pubid parameter as ## provided by the service (e.g. ra-4ff80aae118f4dad), and add a call to ## the bfe_addthis formatting element in your formats, for example in ## Default_HTML_detailed.bft. CFG_BIBFORMAT_ADDTHIS_ID = ## CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS -- For each output ## format BibReformat currently creates a cache for only one language ## (CFG_SITE_LANG) per record. This means that visitors having set a ## different language than CFG_SITE_LANG will be served an on-the-fly ## output using the language of their choice. You can disable this ## behaviour by specifying below for which output formats you would ## like to force the cache to be used whatever language is ## requested. If your format templates do not provide ## internationalization, you can optimize your site by setting ## e.g.
hb,hd to always serve the precached output (if it exists) in ## the CFG_SITE_LANG CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS = ## CFG_BIBFORMAT_CACHED_FORMATS -- Specify the list of cached formats. ## We need to know which ones are cached because bibformat will save the ## cached output of these in a db table CFG_BIBFORMAT_CACHED_FORMATS = #################################### ## Part 20: BibMatch parameters ## #################################### ## CFG_BIBMATCH_LOCAL_SLEEPTIME -- Determines the number of seconds to sleep ## between search queries on the LOCAL system. CFG_BIBMATCH_LOCAL_SLEEPTIME = 0.0 ## CFG_BIBMATCH_REMOTE_SLEEPTIME -- Determines the number of seconds to sleep ## between search queries on REMOTE systems. CFG_BIBMATCH_REMOTE_SLEEPTIME = 2.0 ## CFG_BIBMATCH_FUZZY_WORDLIMITS -- Determines the number of words to extract ## from a certain field's value during fuzzy matching mode. Add/change a field ## and the appropriate number in the dictionary to configure this. CFG_BIBMATCH_FUZZY_WORDLIMITS = { '100__a': 2, '245__a': 4 } ## CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT -- Determines the number of empty results ## to accept during fuzzy matching mode. CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT = 1 ## CFG_BIBMATCH_QUERY_TEMPLATES -- Here you can set the various predefined querystrings ## used to standardize common matching queries. By default the following templates ## are given: ## title - standard title search. Taken from 245__a (default) ## title-author - title and author search (i.e. a title AND author search). ## Taken from 245__a and 100__a ## reportnumber - reportnumber search (i.e. reportnumber:REP-NO-123). CFG_BIBMATCH_QUERY_TEMPLATES = { 'title' : '[title]', 'title-author' : '[title] [author]', 'reportnumber' : 'reportnumber:[reportnumber]' } ## CFG_BIBMATCH_MATCH_VALIDATION_RULESETS -- Here you can define the various rulesets for ## validating search results produced by BibMatch. Each ruleset contains a certain pattern mapped ## to a tuple defining a "matching-strategy". ## ## The rule-definitions must come in two parts: ## ## * The first part is a string containing a regular expression ## that is matched against the textmarc representation of each record. ## If a match is found, the final rule-set is updated with ## the given "sub rule-set", where identical tag rules are replaced. ## ## * The second item is a list of key->value mappings (dict) that indicate specific ## strategy parameters with corresponding validation rules. ## ## This strategy consists of five items: ## ## * MARC TAGS: ## These MARC tags represent the fields taken from the original record and any records from search ## results. When several MARC tags are specified with a given match-strategy, all the fields ## associated with these tags are matched together (i.e. with key "100__a,700__a", all 100__a ## and 700__a fields are matched together, which is useful when the first author can vary for ## certain records on different systems). ## ## * COMPARISON THRESHOLD: ## a value between 0.0 and 1.0 specifying the threshold for string matches ## to determine if it is a match or not (using normalized string-distance). ## Normally 0.8 (80% match) is considered to be a close match. ## ## * COMPARISON MODE: ## the comparison mode decides how the record datafields are compared: ## - 'strict' : all (sub-)fields are compared, and all must match. Order is significant. ## - 'normal' : all (sub-)fields are compared, and all must match. Order is ignored.
## - 'lazy' : all (sub-)fields are compared with each other and at least one must match ## - 'ignored': the tag is ignored in the match. Used to disable previously defined rules. ## ## * MATCHING MODE: ## the matching mode decides how the field values are matched: ## - 'title' : uses a method specialized for comparing titles, e.g. looking for subtitles ## - 'author' : uses a special authorname comparison. Will take initials into account. ## - 'identifier' : special matching for identifiers, stripping away punctuation ## - 'date': matches dates by extracting and comparing the year ## - 'normal': normal string comparison. ## Note: Fields are considered matching when all their subfields or values match. ## ## * RESULT MODE: ## the result mode decides how the results from the comparisons are handled further: ## - 'normal' : a failed match will cause the validation to immediately exit as a failure. ## a successful match will cause the validation to continue on other rules (if any) ## - 'final' : a failed match will cause the validation to immediately exit as a failure. ## a successful match will cause validation to immediately exit as a success. ## - 'joker' : a failed match will cause the validation to continue on other rules (if any). ## a successful match will cause validation to immediately exit as a success. ## ## You can add your own rulesets in the dictionary below. The 'default' ruleset is always applied, ## and should therefore NOT be removed, but can be changed. The tag-rules can also be overwritten ## by other rulesets. ## ## WARNING: Beware that the validation quality is only as good as the given rules, so matching results ## are never guaranteed to be accurate, as validation is very content-specific. CFG_BIBMATCH_MATCH_VALIDATION_RULESETS = [('default', [{ 'tags' : '245__%,242__%', 'threshold' : 0.8, 'compare_mode' : 'lazy', 'match_mode' : 'title', 'result_mode' : 'normal' }, { 'tags' : '037__a,088__a', 'threshold' : 1.0, 'compare_mode' : 'lazy', 'match_mode' : 'identifier', 'result_mode' : 'final' }, { 'tags' : '100__a,700__a', 'threshold' : 0.8, 'compare_mode' : 'normal', 'match_mode' : 'author', 'result_mode' : 'normal' }, { 'tags' : '773__a', 'threshold' : 1.0, 'compare_mode' : 'lazy', 'match_mode' : 'title', 'result_mode' : 'normal' }]), ('980__ \$\$a(THESIS|Thesis)', [{ 'tags' : '100__a', 'threshold' : 0.8, 'compare_mode' : 'strict', 'match_mode' : 'author', 'result_mode' : 'normal' }, { 'tags' : '700__a,701__a', 'threshold' : 1.0, 'compare_mode' : 'lazy', 'match_mode' : 'author', 'result_mode' : 'normal' }, { 'tags' : '100__a,700__a', 'threshold' : 0.8, 'compare_mode' : 'ignored', 'match_mode' : 'author', 'result_mode' : 'normal' }]), ('260__', [{ 'tags' : '260__c', 'threshold' : 0.8, 'compare_mode' : 'lazy', 'match_mode' : 'date', 'result_mode' : 'normal' }]), ('0247_', [{ 'tags' : '0247_a', 'threshold' : 1.0, 'compare_mode' : 'lazy', 'match_mode' : 'identifier', 'result_mode' : 'final' }]), ('020__', [{ 'tags' : '020__a', 'threshold' : 1.0, 'compare_mode' : 'lazy', 'match_mode' : 'identifier', 'result_mode' : 'final' }]) ] ## CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT -- Determines the minimum fraction of ## rules that must be positively matched when comparing two records. Should the number ## of matches be lower than the required matches but equal to or above this limit, ## the match will be considered fuzzy.
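## A worked example: suppose 4 rules are applicable and all are required to ## match, but only 3 match positively; the ratio 3/4 = 0.75 is below full ## agreement yet at or above the 0.65 limit set below, so the record pair ## would be reported as a fuzzy match.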
CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT = 0.65 ## CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT -- Determines the maximum number of search results ## a single search can return before acting as a non-match. CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT = 15 ## CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS -- Determines the minimum number of comparisons ## required for a validation process to be fully "valid". With the default value, this means that ## if at least 2 comparisons (rules) are made, the validation may succeed or fail on its ## own merits; should fewer comparisons have taken place, the validation ## returns as a failure. CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS = 2 ###################################### ## Part 21: BibAuthorID parameters ## ###################################### # CFG_BIBAUTHORID_MAX_PROCESSES is the max number of processes # that may be spawned by the disambiguation algorithm CFG_BIBAUTHORID_MAX_PROCESSES = 12 # CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS is the max number of threads # to parallelize SQL queries during personID table updates CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS = 12 # CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY defines the user info # keys for externally claimed records in a remote-login scenario, e.g. from arXiv.org; # e.g. "external_arxivids" for arXiv SSO CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY = # CFG_BIBAUTHORID_ENABLED # Globally enable AuthorID interfaces. # If False: No guest, user or operator will have access to the system. CFG_BIBAUTHORID_ENABLED = True # CFG_BIBAUTHORID_ON_AUTHORPAGES # Enable AuthorID information on the author pages. CFG_BIBAUTHORID_ON_AUTHORPAGES = True # CFG_BIBAUTHORID_AUTHOR_TICKET_ADMIN_EMAIL defines the email address # all ticket requests concerning authors will be sent to. CFG_BIBAUTHORID_AUTHOR_TICKET_ADMIN_EMAIL = info@invenio-software.org # CFG_BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE defines if the optional arXiv stub page is skipped CFG_BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE = False # CFG_BIBAUTHORID_SEARCH_ENGINE_MAX_DATACHUNK_PER_INSERT_DB_QUERY defines the maximum datachunk size to # be inserted into the bibauthorid search engine tables per single insert query # (used when the search engine indexer is updating and populating the tables). # Bigger size means better performance. Nevertheless, take into account # that if it is too big, the MySQL timeout may be exceeded # and the connection with the DB will be lost. CFG_BIBAUTHORID_SEARCH_ENGINE_MAX_DATACHUNK_PER_INSERT_DB_QUERY = 10000000 # CFG_BIBAUTHORID_ENABLED_REMOTE_LOGIN_SYSTEMS defines the available external remote-login # systems through which a user can log in on Inspire CFG_BIBAUTHORID_ENABLED_REMOTE_LOGIN_SYSTEMS = arXiv ######################################### ## Part 22: BibCirculation parameters ## ######################################### ## CFG_BIBCIRCULATION_ITEM_STATUS_OPTIONAL -- comma-separated list of statuses # Example: missing, order delayed, not published # You can always add a new status here, but you may want to run some script # to update the database if you remove some statuses. CFG_BIBCIRCULATION_ITEM_STATUS_OPTIONAL = ## Here you can edit the text of the statuses that have specific roles. # You should run a script to update the database if you change them after having # used the module for some time.
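# For example, a site preferring different wording could put in its # invenio-local.conf something like (a hypothetical value, not a default): # CFG_BIBCIRCULATION_ITEM_STATUS_ON_LOAN = checked out # and then update the existing database rows to use the same string.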
## Item statuses # The book is on loan CFG_BIBCIRCULATION_ITEM_STATUS_ON_LOAN = on loan # Available for loan CFG_BIBCIRCULATION_ITEM_STATUS_ON_SHELF = on shelf # The book is being processed by the library (cataloguing, etc.) CFG_BIBCIRCULATION_ITEM_STATUS_IN_PROCESS = in process # The book has been ordered (bought) CFG_BIBCIRCULATION_ITEM_STATUS_ON_ORDER = on order # The order of the book has been cancelled CFG_BIBCIRCULATION_ITEM_STATUS_CANCELLED = cancelled # The order of the book has not arrived yet CFG_BIBCIRCULATION_ITEM_STATUS_NOT_ARRIVED = not arrived # The order of the book has not arrived yet and has been claimed CFG_BIBCIRCULATION_ITEM_STATUS_CLAIMED = claimed # The book has been proposed for acquisition and is under review. CFG_BIBCIRCULATION_ITEM_STATUS_UNDER_REVIEW = under review ## Loan statuses # This status should not be confused with CFG_BIBCIRCULATION_ITEM_STATUS_ON_LOAN. # If the item status is CFG_BIBCIRCULATION_ITEM_STATUS_ON_LOAN, then there is # a loan with status CFG_BIBCIRCULATION_LOAN_STATUS_ON_LOAN or # CFG_BIBCIRCULATION_LOAN_STATUS_EXPIRED. # For each copy, there can only be one active loan ('on loan' or 'expired') at # a time, while there can be many 'returned' loans for the same copy. CFG_BIBCIRCULATION_LOAN_STATUS_ON_LOAN = on loan # The due date has come and the item has not been returned CFG_BIBCIRCULATION_LOAN_STATUS_EXPIRED = expired # The item has been returned. CFG_BIBCIRCULATION_LOAN_STATUS_RETURNED = returned ## Request statuses # There is at least one copy available, and this is the oldest request. CFG_BIBCIRCULATION_REQUEST_STATUS_PENDING = pending # There are no copies available, or there is another request with higher priority. CFG_BIBCIRCULATION_REQUEST_STATUS_WAITING = waiting # The request has become a loan CFG_BIBCIRCULATION_REQUEST_STATUS_DONE = done # The request has been cancelled CFG_BIBCIRCULATION_REQUEST_STATUS_CANCELLED = cancelled # The request has been generated for a proposed book CFG_BIBCIRCULATION_REQUEST_STATUS_PROPOSED = proposed # ILL request statuses CFG_BIBCIRCULATION_ILL_STATUS_NEW = new CFG_BIBCIRCULATION_ILL_STATUS_REQUESTED = requested CFG_BIBCIRCULATION_ILL_STATUS_ON_LOAN = on loan CFG_BIBCIRCULATION_ILL_STATUS_RETURNED = returned CFG_BIBCIRCULATION_ILL_STATUS_CANCELLED = cancelled CFG_BIBCIRCULATION_ILL_STATUS_RECEIVED = received # Book proposal statuses CFG_BIBCIRCULATION_PROPOSAL_STATUS_NEW = proposal-new CFG_BIBCIRCULATION_PROPOSAL_STATUS_ON_ORDER = proposal-on order CFG_BIBCIRCULATION_PROPOSAL_STATUS_PUT_ASIDE = proposal-put aside CFG_BIBCIRCULATION_PROPOSAL_STATUS_RECEIVED = proposal-received # Purchase statuses CFG_BIBCIRCULATION_ACQ_STATUS_NEW = new CFG_BIBCIRCULATION_ACQ_STATUS_ON_ORDER = on order CFG_BIBCIRCULATION_ACQ_STATUS_PARTIAL_RECEIPT = partial receipt CFG_BIBCIRCULATION_ACQ_STATUS_RECEIVED = received CFG_BIBCIRCULATION_ACQ_STATUS_CANCELLED = cancelled ## Library types # Normal library where you have your books. It can also be a depot. CFG_BIBCIRCULATION_LIBRARY_TYPE_INTERNAL = internal # External libraries for ILL. CFG_BIBCIRCULATION_LIBRARY_TYPE_EXTERNAL = external # The main library is also an internal library. # Since you may have several depots or small sites, you can tag one of them as # the main site. CFG_BIBCIRCULATION_LIBRARY_TYPE_MAIN = main # It is also an internal library. The copies in this type of library will NOT # be displayed to borrowers. Use this for depots. CFG_BIBCIRCULATION_LIBRARY_TYPE_HIDDEN = hidden ## Amazon access key. You will need your own key.
# Example: 1T6P5M3ZDMW9AWJ212R2 CFG_BIBCIRCULATION_AMAZON_ACCESS_KEY = ###################################### ## Part 23: BibClassify parameters ## ###################################### # CFG_BIBCLASSIFY_WEB_MAXKW -- maximum number of keywords to display # in the Keywords tab web page. CFG_BIBCLASSIFY_WEB_MAXKW = 100 ######################################## ## Part 24: Plotextractor parameters ## ######################################## ## CFG_PLOTEXTRACTOR_SOURCE_BASE_URL -- for acquiring source tarballs for plot ## extraction, where should we look? If nothing is set, we'll just go ## to arXiv, but this can be a filesystem location, too CFG_PLOTEXTRACTOR_SOURCE_BASE_URL = http://arxiv.org/ ## CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER -- for acquiring source tarballs for plot ## extraction, the subfolder where the tarballs sit CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER = e-print/ ## CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER -- for acquiring source tarballs for plot ## extraction, the subfolder where the PDFs sit CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER = pdf/ ## CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT -- a float representing the number of seconds ## to wait between each download of a pdf and/or tarball from the source URL. CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT = 2.0 ## CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT -- when extracting the context of plots from ## TeX sources, this is the limit on the number of characters to extract in each ## direction. Default 750. CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT = 750 ## CFG_PLOTEXTRACTOR_DISALLOWED_TEX -- when extracting the context of plots from TeX ## sources, this is the list of TeX tags that will trigger 'end of context'. CFG_PLOTEXTRACTOR_DISALLOWED_TEX = begin,end,section,includegraphics,caption,acknowledgements ## CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT -- when extracting the context of plots from ## TeX sources, this is the limit on the number of words in each direction. Default 75. CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT = 75 ## CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT -- when extracting the context of plots from ## TeX sources, this is the limit on the number of sentences in each direction. Default 2. CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT = 2 ###################################### ## Part 25: WebStat parameters ## ###################################### # CFG_WEBSTAT_BIBCIRCULATION_START_YEAR defines the start date of the BibCirculation # statistics. The value should have the format 'yyyy'. If empty, take all existing data. CFG_WEBSTAT_BIBCIRCULATION_START_YEAR = ###################################### ## Part 26: Web API Key parameters ## ###################################### # CFG_WEB_API_KEY_ALLOWED_URL defines the web apps that are going to use the web # API key. Each entry has three values: the name of the web app, the lifetime of the # secure URL, and whether a timestamp is needed. #CFG_WEB_API_KEY_ALLOWED_URL = [('search/\?', 3600, True), # ('rss', 0, False)] CFG_WEB_API_KEY_ALLOWED_URL = [] ########################################## ## Part 27: WebAuthorProfile parameters ## ########################################## # CFG_WEBAUTHORPROFILE_CACHE_EXPIRED_DELAY_LIVE considers a cached element expired after this many days # when loading an author page, thus recomputing the content live CFG_WEBAUTHORPROFILE_CACHE_EXPIRED_DELAY_LIVE = 7 # CFG_WEBAUTHORPROFILE_CACHE_EXPIRED_DELAY_BIBSCHED considers a cached element expired after this many days, # thus recomputing it via the bibsched daemon CFG_WEBAUTHORPROFILE_CACHE_EXPIRED_DELAY_BIBSCHED = 5 # CFG_WEBAUTHORPROFILE_MAX_COLLAB_LIST: limit collaboration list. # Set to 0 to disable limit.
CFG_WEBAUTHORPROFILE_MAX_COLLAB_LIST = 100 # CFG_WEBAUTHORPROFILE_MAX_KEYWORD_LIST: limit keyword list. # Set to 0 to disable limit. CFG_WEBAUTHORPROFILE_MAX_KEYWORD_LIST = 100 # CFG_WEBAUTHORPROFILE_MAX_FIELDCODE_LIST: limit frequent keyword list. # Set to 0 to disable limit. CFG_WEBAUTHORPROFILE_MAX_FIELDCODE_LIST = 100 # CFG_WEBAUTHORPROFILE_MAX_AFF_LIST: limit affiliations list. # Set to 0 to disable limit. CFG_WEBAUTHORPROFILE_MAX_AFF_LIST = 100 # CFG_WEBAUTHORPROFILE_MAX_COAUTHOR_LIST: limit coauthors list. # Set to 0 to disable limit. CFG_WEBAUTHORPROFILE_MAX_COAUTHOR_LIST = 100 # CFG_WEBAUTHORPROFILE_MAX_HEP_CHOICES: limit HepRecords choices. # Set to 0 to disable limit. CFG_WEBAUTHORPROFILE_MAX_HEP_CHOICES = 10 # CFG_WEBAUTHORPROFILE_USE_BIBAUTHORID: use bibauthorid or exactauthor CFG_WEBAUTHORPROFILE_USE_BIBAUTHORID = False # CFG_WEBAUTHORPROFILE_USE_ALLOWED_FIELDCODES: use allowed fieldcodes CFG_WEBAUTHORPROFILE_USE_ALLOWED_FIELDCODES = True # CFG_WEBAUTHORPROFILE_ALLOWED_FIELDCODES: allowed fieldcodes CFG_WEBAUTHORPROFILE_ALLOWED_FIELDCODES = ['Astrophysics', 'Accelerators', 'Computing', \ 'Experiment-HEP', 'Gravitation and Cosmology', 'Instrumentation', 'Lattice', \ 'Math and Math Physics', 'Theory-Nucl', 'Other', 'Phenomenology-HEP', 'General Physics', \ 'Theory-HEP', 'Experiment-Nucl'] CFG_WEBAUTHORPROFILE_CFG_HEPNAMES_EMAIL = authors@inspirehep.net # CFG_WEBAUTHORPROFILE_SERIALIZER = [msgpack|pickle] # Select which serializer is used to store caches in the database CFG_WEBAUTHORPROFILE_SERIALIZER = msgpack # CFG_WEBAUTHORPROFILE_ORCID_ENDPOINT_* # Configure the endpoint server used to connect to ORCID CFG_WEBAUTHORPROFILE_ORCID_ENDPOINT_PUBLIC = http://pub.orcid.org/ # http://sandbox-1.orcid.org/ CFG_WEBAUTHORPROFILE_ORCID_ENDPOINT_MEMBER = http://api.orcid.org/ # http://api.sandbox-1.orcid.org/ #################################### ## Part 28: BibSort parameters ## #################################### ## CFG_BIBSORT_ENABLED -- whether to enable or disable record sorting ## by bibsort. CFG_BIBSORT_ENABLED = 1 ## CFG_BIBSORT_BUCKETS -- the number of buckets bibsort should use. ## If 0, then no buckets will be used (bibsort will be inactive). ## If different from 0, bibsort will be used for sorting the records. ## The number of buckets should be set with regard to the size ## of the repository; having a larger number of buckets will increase ## the sorting performance for the top results but will decrease ## the performance for sorting the middle results. ## We recommend using 1 in case you have fewer than about ## 1,000,000 records. ## When modifying this variable, re-run rebalancing for all the bibsort ## methods, to keep the database in sync. CFG_BIBSORT_BUCKETS = 1 ######################################## ## Part 29: JsTestDriver parameters ## ######################################## ## CFG_JSTESTDRIVER_PORT -- server port where JS tests will be run. CFG_JSTESTDRIVER_PORT = 9876 ############################ ## Part 30: RefExtract ## ############################ ## Refextract can automatically submit tickets (after extracting references) ## to CFG_REFEXTRACT_TICKET_QUEUE if it is set CFG_REFEXTRACT_TICKET_QUEUE = None ## Override refextract kbs locations CFG_REFEXTRACT_KBS_OVERRIDE = {} ################################## ## Part 31: CrossRef parameters ## ################################## ## CFG_CROSSREF_USERNAME -- the username used when sending requests ## to the Crossref site.
CFG_CROSSREF_USERNAME = ## CFG_CROSSREF_PASSWORD -- the password used when sending requests ## to the Crossref site. CFG_CROSSREF_PASSWORD = ## CFG_CROSSREF_EMAIL -- Crossref query services email CFG_CROSSREF_EMAIL = ##################################### ## Part 32: WebLinkback parameters ## ##################################### ## CFG_WEBLINKBACK_TRACKBACK_ENABLED -- whether to enable trackback support: ## 1 to enable, 0 to disable it CFG_WEBLINKBACK_TRACKBACK_ENABLED = 0 #################################### ## Part 33: WebSubmit parameters ## #################################### ## CFG_WEBSUBMIT_USE_MATHJAX -- whether to use MathJax and the math ## preview panel within submissions (1) or not (0). Customize your ## websubmit_template.tmpl_mathpreview_header() to enable it for specific ## fields. ## See also CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS CFG_WEBSUBMIT_USE_MATHJAX = 0 #################################### ## Part 34: BibField parameters ## #################################### ## CFG_BIBFIELD_MASTER_FORMATS -- the names of all the allowed master formats ## that BibField will work with. CFG_BIBFIELD_MASTER_FORMATS = marc #################################### ## Part 35: HEPData parameters ## #################################### ## CFG_HEPDATA_URL -- defines the URL of the server hosting HEPData CFG_HEPDATA_URL = http://hepdata.cedar.ac.uk ## CFG_HEPDATA_PLOTSIZE -- defines the width and height of plots that are ## displayed in the hepdata tab on record pages CFG_HEPDATA_PLOTSIZE = 200 ## CFG_HEPDATA_THREADS_NUM -- defines the default number of concurrent ## harvesting threads used when retrieving HEPData records. CFG_HEPDATA_THREADS_NUM = 1 ## CFG_HEPDATA_INDEX -- describes the name of the index used to link ## datasets to their parent records CFG_HEPDATA_INDEX = hepdataparent ## CFG_HEPDATA_FIELD -- The name of the field which should be used for search CFG_HEPDATA_FIELD = hepdataparent ########################## ## Part 36: PdfChecker ## ########################## ## CFG_ARXIV_URL_PATTERN -- the URL to use for fetching arXiv PDFs CFG_ARXIV_URL_PATTERN = http://export.arxiv.org/pdf/%sv%s.pdf ########################## ## THAT's ALL, FOLKS! ## ########################## diff --git a/modules/bibupload/lib/bibupload.py b/modules/bibupload/lib/bibupload.py index 57f54c6f1..ca2c5b2fa 100644 --- a/modules/bibupload/lib/bibupload.py +++ b/modules/bibupload/lib/bibupload.py @@ -1,3001 +1,2991 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibUpload: Receive a MARC XML file and update the appropriate database tables according to options.
""" __revision__ = "$Id$" import os import re import sys import time from datetime import datetime from zlib import compress import socket import marshal import copy import tempfile import urlparse import urllib2 import urllib from invenio.config import CFG_OAI_ID_FIELD, \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, \ CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG, \ CFG_BIBUPLOAD_STRONG_TAGS, \ CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS, \ CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE, \ CFG_BIBUPLOAD_DELETE_FORMATS, \ CFG_SITE_URL, \ CFG_SITE_SECURE_URL, \ CFG_SITE_RECORD, \ CFG_OAI_PROVENANCE_ALTERED_SUBFIELD, \ CFG_BIBUPLOAD_DISABLE_RECORD_REVISIONS, \ CFG_BIBUPLOAD_CONFLICTING_REVISION_TICKET_QUEUE, \ CFG_CERN_SITE, \ - CFG_INSPIRE_SITE + CFG_BIBUPLOAD_MATCH_DELETED_RECORDS from invenio.jsonutils import json, CFG_JSON_AVAILABLE from invenio.bibupload_config import CFG_BIBUPLOAD_CONTROLFIELD_TAGS, \ CFG_BIBUPLOAD_SPECIAL_TAGS, \ CFG_BIBUPLOAD_DELETE_CODE, \ CFG_BIBUPLOAD_DELETE_VALUE, \ CFG_BIBUPLOAD_OPT_MODES from invenio.dbquery import run_sql from invenio.bibrecord import create_records, \ record_add_field, \ record_delete_field, \ record_xml_output, \ record_get_field_instances, \ record_get_field_value, \ record_get_field_values, \ field_get_subfield_values, \ field_get_subfield_instances, \ record_modify_subfield, \ record_delete_subfield_from, \ record_delete_fields, \ record_add_subfield_into, \ record_find_field, \ record_extract_oai_id, \ record_extract_dois, \ record_has_field, \ records_identical from invenio.search_engine import get_record, record_exists, search_pattern from invenio.dateutils import convert_datestruct_to_datetext from invenio.errorlib import register_exception from invenio.bibcatalog import BIBCATALOG_SYSTEM from invenio.intbitset import intbitset from invenio.urlutils import make_user_agent_string from invenio.config import CFG_BIBDOCFILE_FILEDIR from invenio.bibtask import task_init, write_message, \ task_set_option, task_get_option, task_get_task_param, \ task_update_progress, task_sleep_now_if_required, fix_argv_paths, \ RecoverableError from invenio.bibdocfile import BibRecDocs, file_strip_ext, normalize_format, \ get_docname_from_url, check_valid_url, download_url, \ KEEP_OLD_VALUE, decompose_bibdocfile_url, InvenioBibDocFileError, \ bibdocfile_url_p, CFG_BIBDOCFILE_AVAILABLE_FLAGS, guess_format_from_url, \ BibRelation, MoreInfo from invenio.search_engine import search_pattern from invenio.bibupload_revisionverifier import RevisionVerifier, \ InvenioBibUploadConflictingRevisionsError, \ InvenioBibUploadInvalidRevisionError, \ InvenioBibUploadMissing005Error, \ InvenioBibUploadUnchangedRecordError #Statistic variables stat = {} stat['nb_records_to_upload'] = 0 stat['nb_records_updated'] = 0 stat['nb_records_inserted'] = 0 stat['nb_errors'] = 0 stat['nb_holdingpen'] = 0 stat['exectime'] = time.localtime() _WRITING_RIGHTS = None CFG_BIBUPLOAD_ALLOWED_SPECIAL_TREATMENTS = ('oracle', ) CFG_HAS_BIBCATALOG = "UNKNOWN" def check_bibcatalog(): """ Return True if bibcatalog is available. 
""" global CFG_HAS_BIBCATALOG # pylint: disable=W0603 if CFG_HAS_BIBCATALOG != "UNKNOWN": return CFG_HAS_BIBCATALOG CFG_HAS_BIBCATALOG = True if BIBCATALOG_SYSTEM is not None: bibcatalog_response = BIBCATALOG_SYSTEM.check_system() else: bibcatalog_response = "No ticket system configured" if bibcatalog_response != "": write_message("BibCatalog error: %s\n" % (bibcatalog_response,)) CFG_HAS_BIBCATALOG = False return CFG_HAS_BIBCATALOG ## Let's set a reasonable timeout for URL request (e.g. FFT) socket.setdefaulttimeout(40) def parse_identifier(identifier): """Parse the identifier and determine if it is temporary or fixed""" id_str = str(identifier) if not id_str.startswith("TMP:"): return (False, identifier) else: return (True, id_str[4:]) def resolve_identifier(tmps, identifier): """Resolves an identifier. If the identifier is not temporary, this function is an identity on the second argument. Otherwise, a resolved value is returned or an exception raised""" is_tmp, tmp_id = parse_identifier(identifier) if is_tmp: if not tmp_id in tmps: raise StandardError("Temporary identifier %s not present in the dictionary" % (tmp_id, )) if tmps[tmp_id] == -1: # the identifier has been signalised but never assigned a value - probably error during processing raise StandardError("Temporary identifier %s has been declared, but never assigned a value. Probably an error during processign of an appropriate FFT has happened. Please see the log" % (tmp_id, )) return int(tmps[tmp_id]) else: return int(identifier) _re_find_001 = re.compile('\\s*(\\d*)\\s*', re.S) def bibupload_pending_recids(): """This function embed a bit of A.I. and is more a hack than an elegant algorithm. It should be updated in case bibupload/bibsched are modified in incompatible ways. This function return the intbitset of all the records that are being (or are scheduled to be) touched by other bibuploads. """ options = run_sql("""SELECT arguments FROM schTASK WHERE status<>'DONE' AND proc='bibupload' AND (status='RUNNING' OR status='CONTINUING' OR status='WAITING' OR status='SCHEDULED' OR status='ABOUT TO STOP' OR status='ABOUT TO SLEEP')""") ret = intbitset() xmls = [] if options: for arguments in options: arguments = marshal.loads(arguments[0]) for argument in arguments[1:]: if argument.startswith('/'): # XMLs files are recognizable because they're absolute # files... xmls.append(argument) for xmlfile in xmls: # Let's grep for the 001 try: xml = open(xmlfile).read() ret += [int(group[1]) for group in _re_find_001.findall(xml)] except: continue return ret ### bibupload engine functions: def bibupload(record, opt_mode=None, opt_notimechange=0, oai_rec_id="", pretend=False, tmp_ids=None, tmp_vers=None): """Main function: process a record and fit it in the tables bibfmt, bibrec, bibrec_bibxxx, bibxxx with proper record metadata. Return (error_code, recID) of the processed record. """ if tmp_ids is None: tmp_ids = {} if tmp_vers is None: tmp_vers = {} if opt_mode == 'reference': ## NOTE: reference mode has been deprecated in favour of 'correct' opt_mode = 'correct' assert(opt_mode in CFG_BIBUPLOAD_OPT_MODES) try: record_xml_output(record).decode('utf-8') except UnicodeDecodeError: msg = " Failed: Invalid utf-8 characters." 
write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) error = None affected_tags = {} original_record = {} rec_old = {} now = datetime.now() # will hold record creation/modification date record_had_altered_bit = False is_opt_mode_delete = False # Extraction of the Record Id from 001, SYSNO or OAIID or DOI tags: rec_id = retrieve_rec_id(record, opt_mode, pretend=pretend) if rec_id == -1: msg = " Failed: either the record already exists and insert was " \ "requested or the record does not exist and " \ "replace/correct/append has been used" write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) elif rec_id > 0: write_message(" -Retrieve record ID (found %s): DONE." % rec_id, verbose=2) (unique_p, msg) = check_record_doi_is_unique(rec_id, record) if not unique_p: write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if not record.has_key('001'): # Found record ID by means of SYSNO or OAIID or DOI, and the # input MARCXML buffer does not have this 001 tag, so we # should add it now: error = record_add_field(record, '001', controlfield_value=rec_id) if error is None: msg = " Failed: Error while adding the 001 controlfield " \ "to the record" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: error = None write_message(" -Added tag 001: DONE.", verbose=2) write_message(" -Check if the xml marc file is already in the database: DONE" , verbose=2) record_deleted_p = False if opt_mode == 'insert' or \ (opt_mode == 'replace_or_insert') and rec_id is None: insert_mode_p = True # Insert the record into the bibrec databases to have a recordId rec_id = create_new_record(pretend=pretend) write_message(" -Creation of a new record id (%d): DONE" % rec_id, verbose=2) # we add the record Id control field to the record error = record_add_field(record, '001', controlfield_value=rec_id) if error is None: msg = " Failed: Error while adding the 001 controlfield " \ "to the record" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: error = None if '005' not in record: error = record_add_field(record, '005', controlfield_value=now.strftime("%Y%m%d%H%M%S.0")) if error is None: msg = " ERROR: while adding the 005 controlfield to the record" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: error = None else: write_message(" Note: 005 already exists upon insert of a new record. Keeping it.", verbose=2) elif opt_mode != 'insert': insert_mode_p = False # Update Mode # Retrieve the old record to update rec_old = get_record(rec_id) record_had_altered_bit = record_get_field_values(rec_old, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4], CFG_OAI_PROVENANCE_ALTERED_SUBFIELD) # Also save a copy to restore the previous situation in case of errors original_record = get_record(rec_id) if rec_old is None: msg = " Failed during the retrieval of the old record!" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: write_message(" -Retrieve the old record to update: DONE", verbose=2) # flag to check whether the revisions have been verified and a patch generated.
# If revision verification failed, then we need to manually identify the affected tags # and process them revision_verified = False rev_verifier = RevisionVerifier() # check for revision conflicts before updating the record if record_has_field(record, '005') and not CFG_BIBUPLOAD_DISABLE_RECORD_REVISIONS: write_message(" -Upload Record has 005. Verifying Revision", verbose=2) try: rev_res = rev_verifier.verify_revision(record, original_record, opt_mode) if rev_res: opt_mode = rev_res[0] record = rev_res[1] affected_tags = rev_res[2] revision_verified = True write_message(lambda: " -Patch record generated. Changing opt_mode to correct.\nPatch:\n%s " % record_xml_output(record), verbose=2) else: write_message(" -No Patch Record.", verbose=2) except InvenioBibUploadUnchangedRecordError, err: msg = " -ISSUE: %s" % err write_message(msg, verbose=1, stream=sys.stderr) write_message(" Continuing anyway in case there are FFT or other tags", verbose=2) except InvenioBibUploadConflictingRevisionsError, err: msg = " -ERROR: Conflicting Revisions - %s" % err write_message(msg, verbose=1, stream=sys.stderr) submit_ticket_for_holding_pen(rec_id, err, "Conflicting Revisions. Inserting record into holding pen.", pretend=pretend) insert_record_into_holding_pen(record, str(rec_id), pretend=pretend) return (2, int(rec_id), msg) except InvenioBibUploadInvalidRevisionError, err: msg = " -ERROR: Invalid Revision - %s" % err write_message(msg) submit_ticket_for_holding_pen(rec_id, err, "Invalid Revision. Inserting record into holding pen.", pretend=pretend) insert_record_into_holding_pen(record, str(rec_id), pretend=pretend) return (2, int(rec_id), msg) except InvenioBibUploadMissing005Error, err: msg = " -ERROR: Missing 005 - %s" % err write_message(msg) submit_ticket_for_holding_pen(rec_id, err, "Missing 005. Inserting record into holding pen.", pretend=pretend) insert_record_into_holding_pen(record, str(rec_id), pretend=pretend) return (2, int(rec_id), msg) else: write_message(" - No 005 Tag Present.
Resuming normal flow.", verbose=2) # dictionaries to temporarily hold the original record's tag-fields existing_tags = {} retained_tags = {} # in case of a delete operation, affected tags should be deleted in delete_bibrec_bibxxx # but should not be updated again in STAGE 4 # utilising the below flag is_opt_mode_delete = False if not revision_verified: # either 005 was not present or opt_mode was not correct/replace # in this case we still need to find out the affected tags to process write_message(" - Missing 005 or opt_mode != Replace/Correct. Revision Verifier not called.", verbose=2) # Identify affected tags if opt_mode == 'correct' or opt_mode == 'replace' or opt_mode == 'replace_or_insert': rec_diff = rev_verifier.compare_records(record, original_record, opt_mode) affected_tags = rev_verifier.retrieve_affected_tags_with_ind(rec_diff) elif opt_mode == 'delete': # populate an intermediate dictionary # used in the upcoming step related to 'delete' mode is_opt_mode_delete = True for tag, fields in original_record.iteritems(): existing_tags[tag] = [tag + (field[1] != ' ' and field[1] or '_') + (field[2] != ' ' and field[2] or '_') for field in fields] elif opt_mode == 'append': for tag, fields in record.iteritems(): if tag not in CFG_BIBUPLOAD_CONTROLFIELD_TAGS: affected_tags[tag] = [(field[1], field[2]) for field in fields] # In Replace mode, take over old strong tags if applicable: if opt_mode == 'replace' or \ opt_mode == 'replace_or_insert': copy_strong_tags_from_old_record(record, rec_old) # Delete tags to correct in the record if opt_mode == 'correct': delete_tags_to_correct(record, rec_old) write_message(" -Delete the old tags to correct in the old record: DONE", verbose=2) # Delete specified tags if in delete mode if opt_mode == 'delete': record = delete_tags(record, rec_old) for tag, fields in record.iteritems(): retained_tags[tag] = [tag + (field[1] != ' ' and field[1] or '_') + (field[2] != ' ' and field[2] or '_') for field in fields] # identify the tags that have been deleted for tag in existing_tags.keys(): if tag not in retained_tags: for item in existing_tags[tag]: tag_to_add = item[0:3] ind1, ind2 = item[3], item[4] if tag_to_add in affected_tags and (ind1, ind2) not in affected_tags[tag_to_add]: affected_tags[tag_to_add].append((ind1, ind2)) else: affected_tags[tag_to_add] = [(ind1, ind2)] else: deleted = list(set(existing_tags[tag]) - set(retained_tags[tag])) for item in deleted: tag_to_add = item[0:3] ind1, ind2 = item[3], item[4] if tag_to_add in affected_tags and (ind1, ind2) not in affected_tags[tag_to_add]: affected_tags[tag_to_add].append((ind1, ind2)) else: affected_tags[tag_to_add] = [(ind1, ind2)] write_message(" -Delete specified tags in the old record: DONE", verbose=2) # Append new tags to the old record and update the new record with the modified old_record if opt_mode == 'append' or opt_mode == 'correct': record = append_new_tag_to_old_record(record, rec_old) write_message(" -Append new tags to the old record: DONE", verbose=2) write_message(" -Affected Tags found after comparing upload and original records: %s" % (str(affected_tags)), verbose=2) # a 005 tag should be added every time the record is modified # If an existing record is modified, its 005 tag should be overwritten with a new revision value if record.has_key('005'): record_delete_field(record, '005') write_message(" Deleted the existing 005 tag.", verbose=2) last_revision = run_sql("SELECT MAX(job_date) FROM hstRECORD WHERE id_bibrec=%s", (rec_id, ))[0][0] if last_revision and last_revision.strftime("%Y%m%d%H%M%S.0") ==
now.strftime("%Y%m%d%H%M%S.0"): ## We are updating the same record within the same second! That's below ## the minimal 005 granularity. Let's pause for 1 more second to take a breath :-) time.sleep(1) now = datetime.now() error = record_add_field(record, '005', controlfield_value=now.strftime("%Y%m%d%H%M%S.0")) if error is None: msg = " Failed: Error while adding the 005 controlfield to the record" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: error = None write_message(lambda: " -Added tag 005: DONE. " + str(record_get_field_value(record, '005', '', '')), verbose=2) # adding 005 to the affected tags will delete the existing 005 entry # and update it with the latest timestamp. if '005' not in affected_tags: affected_tags['005'] = [(' ', ' ')] write_message(" -Stage COMPLETED", verbose=2) record_deleted_p = False try: if not record_is_valid(record): msg = "ERROR: record is not valid" write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) # Have a look if we have FFT tags write_message("Stage 2: Start (Process FFT tags if exist).", verbose=2) record_had_FFT = False bibrecdocs = None if extract_tag_from_record(record, 'FFT') is not None: record_had_FFT = True if not writing_rights_p(): msg = "ERROR: no rights to write fulltext files" write_message(" Stage 2 failed: %s" % msg, verbose=1, stream=sys.stderr) raise StandardError(msg) try: bibrecdocs = BibRecDocs(rec_id) record = elaborate_fft_tags(record, rec_id, opt_mode, pretend=pretend, tmp_ids=tmp_ids, tmp_vers=tmp_vers, bibrecdocs=bibrecdocs) except Exception, e: register_exception() msg = " Stage 2 failed: ERROR: while elaborating FFT tags: %s" % e write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if record is None: msg = " Stage 2 failed: ERROR: while elaborating FFT tags" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Have a look if we have 856 tags write_message("Stage 2B: Start (Synchronize 8564 tags).", verbose=2) if record_had_FFT or extract_tag_from_record(record, '856') is not None: try: if bibrecdocs is None: bibrecdocs = BibRecDocs(rec_id) record = synchronize_8564(rec_id, record, record_had_FFT, bibrecdocs, pretend=pretend) # in case FFT is in the affected list, make the appropriate changes if not insert_mode_p: # because for insert, all tags are affected if ('4', ' ') not in affected_tags.get('856', []): if '856' not in affected_tags: affected_tags['856'] = [('4', ' ')] elif ('4', ' ') not in affected_tags['856']: affected_tags['856'].append(('4', ' ')) write_message(" -Modified field list updated with FFT details: %s" % str(affected_tags), verbose=2) except Exception, e: register_exception(alert_admin=True) msg = " Stage 2B failed: ERROR: while synchronizing 8564 tags: %s" % e write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if record is None: msg = " Stage 2B failed: ERROR: while synchronizing 8564 tags" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) write_message("Stage 3: Start (Apply field deletion requests).", verbose=2) write_message(lambda: " Record before deletion:\n%s" % record_xml_output(record), verbose=9) # remove fields with __DELETE_FIELDS__ # NOTE: creating a temporary deep copy of the record for iteration to avoid a # RuntimeError due to change in dictionary size during
iteration tmp_rec = copy.deepcopy(record) for tag in tmp_rec: for data_tuple in record[tag]: if (CFG_BIBUPLOAD_DELETE_CODE, CFG_BIBUPLOAD_DELETE_VALUE) in data_tuple[0]: # delete the tag with the particular indicator pairs from the original record record_delete_field(record, tag, data_tuple[1], data_tuple[2]) write_message(lambda: " Record after cleaning up fields to be deleted:\n%s" % record_xml_output(record), verbose=9) # Update of the BibFmt write_message("Stage 4: Start (Update bibfmt).", verbose=2) updates_exist = not records_identical(record, original_record) if updates_exist: # if record_had_altered_bit, this must be set to true, since the # record has been altered. if record_had_altered_bit: oai_provenance_fields = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]) for oai_provenance_field in oai_provenance_fields: for i, (code, dummy_value) in enumerate(oai_provenance_field[0]): if code == CFG_OAI_PROVENANCE_ALTERED_SUBFIELD: oai_provenance_field[0][i] = (code, 'true') tmp_indicators = (CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]) if tmp_indicators not in affected_tags.get(CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3], []): if CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3] not in affected_tags: affected_tags[CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]] = [tmp_indicators] else: affected_tags[CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]].append(tmp_indicators) write_message(lambda: " Updates exist:\n%s\n!=\n%s" % (record, original_record), verbose=9) # format the single record as xml rec_xml_new = record_xml_output(record) # Update bibfmt with the format xm of this record modification_date = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(record_get_field_value(record, '005'), '%Y%m%d%H%M%S.0')) error = update_bibfmt_format(rec_id, rec_xml_new, 'xm', modification_date, pretend=pretend) if error == 1: msg = " Failed: ERROR: during update_bibfmt_format 'xm'" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE: error = update_bibfmt_format(rec_id, marshal.dumps(record), 'recstruct', modification_date, pretend=pretend) if error == 1: msg = " Failed: ERROR: during update_bibfmt_format 'recstruct'" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if not CFG_BIBUPLOAD_DISABLE_RECORD_REVISIONS: # archive MARCXML format of this record for version history purposes: if insert_mode_p: error = archive_marcxml_for_history(rec_id, affected_fields={}, pretend=pretend) else: error = archive_marcxml_for_history(rec_id, affected_fields=affected_tags, pretend=pretend) if error == 1: msg = " ERROR: Failed to archive MARCXML for history" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: write_message(" -Archived MARCXML for history: DONE", verbose=2) # delete some formats like HB upon record change: if updates_exist or record_had_FFT: for format_to_delete in CFG_BIBUPLOAD_DELETE_FORMATS: try: delete_bibfmt_format(rec_id, format_to_delete, pretend=pretend) except: # OK, some formats like HB could not be deleted, no big deal pass write_message(" -Stage COMPLETED", verbose=2) ## Let's assert that exactly one 005 tag exists at this stage.
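## (illustrative sketch of the recstruct layout assumed here, hypothetical
## values) the single 005 controlfield is stored as a one-element list of
## field tuples, e.g.:
##   record['005'] == [([], ' ', ' ', '20130515120000.0', 4)]
## i.e. (subfields, ind1, ind2, value, global field position).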
assert len(record['005']) == 1 # Update the database MetaData write_message("Stage 5: Start (Update the database with the metadata).", verbose=2) if insert_mode_p: update_database_with_metadata(record, rec_id, oai_rec_id, pretend=pretend) write_message(" -Stage COMPLETED", verbose=2) elif opt_mode in ('replace', 'replace_or_insert', 'append', 'correct', 'delete') and updates_exist: # now we clear all the rows from bibrec_bibxxx from the old record record_deleted_p = True delete_bibrec_bibxxx(rec_old, rec_id, affected_tags, pretend=pretend) # the metadata update will insert tags that are available in affected_tags. # but for delete, once the tags have been deleted from bibrec_bibxxx, they don't have to be inserted # except for 005. if is_opt_mode_delete: tmp_affected_tags = copy.deepcopy(affected_tags) for tag in tmp_affected_tags: if tag != '005': affected_tags.pop(tag) write_message(" -Clean bibrec_bibxxx: DONE", verbose=2) update_database_with_metadata(record, rec_id, oai_rec_id, affected_tags, pretend=pretend) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED in mode %s" % opt_mode, verbose=2) record_deleted_p = False # Finally we update the bibrec table with the current date write_message("Stage 6: Start (Update bibrec table with current date).", verbose=2) if opt_notimechange == 0 and (updates_exist or record_had_FFT): bibrec_now = convert_datestruct_to_datetext(time.localtime()) write_message(" -Retrieved current localtime: DONE", verbose=2) update_bibrec_date(bibrec_now, rec_id, insert_mode_p, pretend=pretend) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Increase statistics if insert_mode_p: stat['nb_records_inserted'] += 1 else: stat['nb_records_updated'] += 1 # Upload of this record finished write_message("Record "+str(rec_id)+" DONE", verbose=1) return (0, int(rec_id), "") finally: if record_deleted_p: ## BibUpload has failed, leaving the record deleted. We should ## bring back the original record then. update_database_with_metadata(original_record, rec_id, oai_rec_id, pretend=pretend) write_message(" Restored original record", verbose=1, stream=sys.stderr) def record_is_valid(record): """ Check if the record is valid. Currently this simply checks if the record has exactly one rec_id. @param record: the record @type record: recstruct @return: True if the record is valid @rtype: bool """ rec_ids = record_get_field_values(record, tag="001") if len(rec_ids) != 1: write_message(" The record is not valid: it does not have exactly one rec_id: %s" % (rec_ids), stream=sys.stderr) return False return True def find_record_ids_by_oai_id(oaiId): """ A method finding the record identifiers for a given OAI identifier. Returns a list of record identifiers matching the given OAI identifier. """ # Is this record already in invenio (matching by oaiid) if oaiId: recids = search_pattern(p=oaiId, f=CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, m='e') # Is this record already in invenio (matching by reportnumber i.e. # particularly 037. Idea: to avoid double insertions) repnumber = oaiId.split(":")[-1] if repnumber: recids |= search_pattern(p = repnumber, f = "reportnumber", m = 'e' ) # Is this record already in invenio (matching by reportnumber i.e. # particularly 037.
Idea: to avoid double insertions) repnumber = "arXiv:" + oaiId.split(":")[-1] recids |= search_pattern(p = repnumber, f = "reportnumber", m = 'e' ) - if CFG_INSPIRE_SITE or CFG_CERN_SITE: - ## FIXME: for the time being this functionality is available only on INSPIRE - ## or CDS, and waiting to be replaced by proper pidstore integration + if CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: + return recids + else: if CFG_CERN_SITE: return recids - (search_pattern(p='DELETED', f='980__%', m='e') | search_pattern(p='DUMMY', f='980__%', m='e')) else: return recids - search_pattern(p='DELETED', f='980__%', m='e') - else: - return recids else: return intbitset() def bibupload_post_phase(record, mode=None, rec_id="", pretend=False, tmp_ids=None, tmp_vers=None): def _elaborate_tag(record, tag, fun): if extract_tag_from_record(record, tag) is not None: try: record = fun() except Exception, e: register_exception() write_message(" Stage failed: ERROR: while elaborating %s tags: %s" % (tag, e), verbose=1, stream=sys.stderr) return (1, int(rec_id)) # TODO: ? if record is None: write_message(" Stage failed: ERROR: while elaborating %s tags" % (tag, ), verbose=1, stream=sys.stderr) return (1, int(rec_id)) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) if tmp_ids is None: tmp_ids = {} if tmp_vers is None: tmp_vers = {} _elaborate_tag(record, "BDR", lambda: elaborate_brt_tags(record, rec_id = rec_id, mode = mode, pretend = pretend, tmp_ids = tmp_ids, tmp_vers = tmp_vers)) _elaborate_tag(record, "BDM", lambda: elaborate_mit_tags(record, rec_id = rec_id, mode = mode, pretend = pretend, tmp_ids = tmp_ids, tmp_vers = tmp_vers)) def submit_ticket_for_holding_pen(rec_id, err, msg, pretend=False): """ Submit a ticket via BibCatalog to report about a record that has been put into the Holding Pen. @rec_id: the affected record @err: the corresponding Exception msg: verbose message """ from invenio import bibtask from invenio.webuser import get_email_from_username, get_uid_from_email user = task_get_task_param("user") uid = None if user: try: uid = get_uid_from_email(get_email_from_username(user)) except Exception, err: write_message("WARNING: can't reliably retrieve uid for user %s: %s" % (user, err), stream=sys.stderr) if check_bibcatalog(): text = """ %(msg)s found for record %(rec_id)s: %(err)s See: <%(siteurl)s/record/edit/#state=edit&recid=%(rec_id)s> BibUpload task information: task_id: %(task_id)s task_specific_name: %(task_specific_name)s user: %(user)s task_params: %(task_params)s task_options: %(task_options)s""" % { "msg": msg, "rec_id": rec_id, "err": err, "siteurl": CFG_SITE_SECURE_URL, "task_id": task_get_task_param("task_id"), "task_specific_name": task_get_task_param("task_specific_name"), "user": user, "task_params": bibtask._TASK_PARAMS, "task_options": bibtask._OPTIONS} if not pretend: BIBCATALOG_SYSTEM.ticket_submit(subject="%s: %s by %s" % (msg, rec_id, user), recordid=rec_id, text=text, queue=CFG_BIBUPLOAD_CONFLICTING_REVISION_TICKET_QUEUE, owner=uid) def insert_record_into_holding_pen(record, oai_id, pretend=False): query = "INSERT INTO bibHOLDINGPEN (oai_id, changeset_date, changeset_xml, id_bibrec) VALUES (%s, NOW(), %s, %s)" xml_record = record_xml_output(record) bibrec_ids = find_record_ids_by_oai_id(oai_id) # here determining the identifier of the record if len(bibrec_ids) > 0: bibrec_id = bibrec_ids.pop() else: # id not found by using the oai_id, let's use a wider search based # on any information we might have. 
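# (note) retrieve_rec_id() below falls back on matching by 001, SYSNO,
# external/internal OAI ID and finally DOI; see its definition further down.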
bibrec_id = retrieve_rec_id(record, 'holdingpen', pretend=pretend) if bibrec_id is None: bibrec_id = 0 if not pretend: run_sql(query, (oai_id, compress(xml_record), bibrec_id)) # record_id is logged as 0! ( We are not inserting into the main database) log_record_uploading(oai_id, task_get_task_param('task_id', 0), 0, 'H', pretend=pretend) stat['nb_holdingpen'] += 1 def print_out_bibupload_statistics(): """Print the statistics of the process""" out = "Task stats: %(nb_input)d input records, %(nb_updated)d updated, " \ "%(nb_inserted)d inserted, %(nb_errors)d errors, %(nb_holdingpen)d inserted to holding pen. " \ "Time %(nb_sec).2f sec." % { \ 'nb_input': stat['nb_records_to_upload'], 'nb_updated': stat['nb_records_updated'], 'nb_inserted': stat['nb_records_inserted'], 'nb_errors': stat['nb_errors'], 'nb_holdingpen': stat['nb_holdingpen'], 'nb_sec': time.time() - time.mktime(stat['exectime']) } write_message(out) def open_marc_file(path): """Open a file and return the data""" try: # open the file containing the marc document marc_file = open(path, 'r') marc = marc_file.read() marc_file.close() except IOError, erro: write_message("ERROR: %s" % erro, verbose=1, stream=sys.stderr) if erro.errno == 2: # No such file or directory # Not scary e = RecoverableError('File does not exist: %s' % path) else: e = StandardError('File not accessible: %s' % path) raise e return marc def xml_marc_to_records(xml_marc): """create the records""" # Creation of the records from the xml Marc in argument recs = create_records(xml_marc, 1, 1) if recs == []: msg = "ERROR: Cannot parse MARCXML file." write_message(msg, verbose=1, stream=sys.stderr) raise StandardError(msg) elif recs[0][0] is None: msg = "ERROR: MARCXML file has wrong format: %s" % recs write_message(msg, verbose=1, stream=sys.stderr) raise RecoverableError(msg) else: recs = map((lambda x:x[0]), recs) return recs def find_record_format(rec_id, bibformat): """Look whether record REC_ID is formatted in FORMAT, i.e. whether FORMAT exists in the bibfmt table for this record. Return the number of times it is formatted: 0 if not, 1 if yes, 2 if found more than once (should never occur). """ out = 0 query = """SELECT COUNT(*) FROM bibfmt WHERE id_bibrec=%s AND format=%s""" params = (rec_id, bibformat) res = [] res = run_sql(query, params) out = res[0][0] return out def find_record_from_recid(rec_id): """ Try to find record in the database from the REC_ID number. Return record ID if found, None otherwise. """ res = run_sql("SELECT id FROM bibrec WHERE id=%s", (rec_id,)) if res: return res[0][0] else: return None def find_record_from_sysno(sysno): """ Try to find record in the database from the external SYSNO number. Return record ID if found, None otherwise. 
""" bibxxx = 'bib'+CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:2]+'x' bibrec_bibxxx = 'bibrec_' + bibxxx res = run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, (CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, sysno,)) for recid in res: - if CFG_INSPIRE_SITE or CFG_CERN_SITE: - ## FIXME: for the time being this functionality is available only on INSPIRE - ## or CDS, and waiting to be replaced by proper pidstore integration + if CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: + return recid[0] + else: if record_exists(recid[0]) > 0: ## Only non deleted records return recid[0] - else: - return recid[0] return None def find_records_from_extoaiid(extoaiid, extoaisrc=None): """ Try to find records in the database from the external EXTOAIID number. Return list of record ID if found, None otherwise. """ assert(CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:5] == CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[:5]) bibxxx = 'bib'+CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:2]+'x' bibrec_bibxxx = 'bibrec_' + bibxxx write_message(' Looking for extoaiid="%s" with extoaisrc="%s"' % (extoaiid, extoaisrc), verbose=9) id_bibrecs = intbitset(run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, (CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, extoaiid,))) write_message(' Partially found %s for extoaiid="%s"' % (id_bibrecs, extoaiid), verbose=9) ret = intbitset() for id_bibrec in id_bibrecs: - if CFG_INSPIRE_SITE or CFG_CERN_SITE: - ## FIXME: for the time being this functionality is available only on INSPIRE - ## or CDS, and waiting to be replaced by proper pidstore integration + if not CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: if record_exists(id_bibrec) < 1: ## We don't match not existing records continue record = get_record(id_bibrec) instances = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]) write_message(' recid %s -> instances "%s"' % (id_bibrec, instances), verbose=9) for instance in instances: this_extoaisrc = field_get_subfield_values(instance, CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5]) this_extoaisrc = this_extoaisrc and this_extoaisrc[0] or None this_extoaiid = field_get_subfield_values(instance, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]) this_extoaiid = this_extoaiid and this_extoaiid[0] or None write_message(" this_extoaisrc -> %s, this_extoaiid -> %s" % (this_extoaisrc, this_extoaiid), verbose=9) if this_extoaiid == extoaiid: write_message(' recid %s -> provenance "%s"' % (id_bibrec, this_extoaisrc), verbose=9) if this_extoaisrc == extoaisrc: write_message('Found recid %s for extoaiid="%s" with provenance="%s"' % (id_bibrec, extoaiid, extoaisrc), verbose=9) ret.add(id_bibrec) break if this_extoaisrc is None: write_message('WARNING: Found recid %s for extoaiid="%s" that doesn\'t specify any provenance, while input record does.' % (id_bibrec, extoaiid), stream=sys.stderr) if extoaisrc is None: write_message('WARNING: Found recid %s for extoaiid="%s" that specify a provenance (%s), while input record does not have a provenance.' % (id_bibrec, extoaiid, this_extoaisrc), stream=sys.stderr) return ret def find_record_from_oaiid(oaiid): """ Try to find record in the database from the OAI ID number and OAI SRC. Return record ID if found, None otherwise. 
""" bibxxx = 'bib'+CFG_OAI_ID_FIELD[0:2]+'x' bibrec_bibxxx = 'bibrec_' + bibxxx res = run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, (CFG_OAI_ID_FIELD, oaiid,)) for recid in res: - if CFG_INSPIRE_SITE or CFG_CERN_SITE: - ## FIXME: for the time being this functionality is available only on INSPIRE - ## or CDS, and waiting to be replaced by proper pidstore integration + if CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: + return recid[0] + else: if record_exists(recid[0]) > 0: ## Only non deleted records return recid[0] - else: - return recid[0] return None def find_record_from_doi(doi): """ Try to find record in the database from the given DOI. Return record ID if found, None otherwise. """ bibxxx = 'bib02x' bibrec_bibxxx = 'bibrec_' + bibxxx res = run_sql("""SELECT bb.id_bibrec, bb.field_number FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, ('0247_a', doi,)) # For each of the result, make sure that it is really tagged as doi for (id_bibrec, field_number) in res: - if CFG_INSPIRE_SITE or CFG_CERN_SITE: - ## FIXME: for the time being this functionality is available only on INSPIRE - ## or CDS, and waiting to be replaced by proper pidstore integration + if not CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: if record_exists(id_bibrec) < 1: ## We don't match not existing records continue res = run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id and bb.field_number=%%s and bb.id_bibrec=%%s""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, ('0247_2', "doi", field_number, id_bibrec)) if res and res[0][0] == id_bibrec: return res[0][0] return None def extract_tag_from_record(record, tag_number): """ Extract the tag_number for record.""" # first step verify if the record is not already in the database if record: return record.get(tag_number, None) return None def retrieve_rec_id(record, opt_mode, pretend=False, post_phase = False): """Retrieve the record Id from a record by using tag 001 or SYSNO or OAI ID or DOI tag. opt_mod is the desired mode. @param post_phase Tells if we are calling this method in the postprocessing phase. If true, we accept presence of 001 fields even in the insert mode @type post_phase boolean """ rec_id = None # 1st step: we look for the tag 001 tag_001 = extract_tag_from_record(record, '001') if tag_001 is not None: # We extract the record ID from the tag rec_id = tag_001[0][3] # if we are in insert mode => error if opt_mode == 'insert' and not post_phase: write_message(" Failed: tag 001 found in the xml" \ " submitted, you should use the option replace," \ " correct or append to replace an existing" \ " record. (-h for help)", verbose=1, stream=sys.stderr) return -1 else: # we found the rec id and we are not in insert mode => continue # we try to match rec_id against the database: if find_record_from_recid(rec_id) is not None: # okay, 001 corresponds to some known record return int(rec_id) elif opt_mode in ('replace', 'replace_or_insert'): if task_get_option('force'): # we found the rec_id but it's not in the system and we are # requested to replace records. Therefore we create on the fly # a empty record allocating the recid. write_message(" WARNING: tag 001 found in the xml with" " value %(rec_id)s, but rec_id %(rec_id)s does" " not exist. 
Since the mode replace was" " requested the rec_id %(rec_id)s is allocated" " on-the-fly." % {"rec_id": rec_id}, stream=sys.stderr) return create_new_record(rec_id=rec_id, pretend=pretend) else: # Since --force was not used we are going to raise an error write_message(" Failed: tag 001 found in the xml" " submitted with value %(rec_id)s. The" " corresponding record however does not" " exist. If you want to really create" " such a record, please use the --force" " parameter when calling bibupload." % { "rec_id": rec_id}, stream=sys.stderr) return -1 else: # The record doesn't exist yet. We shall try to check # the SYSNO or OAI or DOI id later. write_message(" -Tag 001 value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag 001 not found in the xml marc file.", verbose=9) if rec_id is None: # 2nd step: we look for the SYSNO sysnos = record_get_field_values(record, CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] or "", CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] or "", CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6]) if sysnos: sysno = sysnos[0] # there should be only one external SYSNO write_message(" -Checking if SYSNO " + sysno + \ " exists in the database", verbose=9) # try to find the corresponding rec id from the database rec_id = find_record_from_sysno(sysno) if rec_id is not None: # rec_id found pass else: # The record doesn't exist yet. We will try to check # external and internal OAI ids later. write_message(" -Tag SYSNO value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag SYSNO not found in the xml marc file.", verbose=9) if rec_id is None: # 3rd step: we look for the external OAIID extoai_fields = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or "", CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or "") if extoai_fields: for field in extoai_fields: extoaiid = field_get_subfield_values(field, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6]) extoaisrc = field_get_subfield_values(field, CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6]) if extoaiid: extoaiid = extoaiid[0] if extoaisrc: extoaisrc = extoaisrc[0] else: extoaisrc = None write_message(" -Checking if EXTOAIID %s (%s) exists in the database" % (extoaiid, extoaisrc), verbose=9) # try to find the corresponding rec id from the database rec_ids = find_records_from_extoaiid(extoaiid, extoaisrc) if rec_ids: # rec_id found rec_id = rec_ids.pop() break else: # The record doesn't exist yet. We will try to check # OAI id later.
write_message(" -Tag EXTOAIID value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag EXTOAIID not found in the xml marc file.", verbose=9) if rec_id is None: # 4th step we look for the OAI ID oaiidvalues = record_get_field_values(record, CFG_OAI_ID_FIELD[0:3], CFG_OAI_ID_FIELD[3:4] != "_" and \ CFG_OAI_ID_FIELD[3:4] or "", CFG_OAI_ID_FIELD[4:5] != "_" and \ CFG_OAI_ID_FIELD[4:5] or "", CFG_OAI_ID_FIELD[5:6]) if oaiidvalues: oaiid = oaiidvalues[0] # there should be only one OAI ID write_message(" -Check if local OAI ID " + oaiid + \ " exist in the database", verbose=9) # try to find the corresponding rec id from the database rec_id = find_record_from_oaiid(oaiid) if rec_id is not None: # rec_id found pass else: write_message(" -Tag OAI ID value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag SYSNO not found in the xml marc file.", verbose=9) if rec_id is None: # 5th step we look for the DOI. record_dois = record_extract_dois(record) matching_recids = set() if record_dois: # try to find the corresponding rec id from the database for record_doi in record_dois: possible_recid = find_record_from_doi(record_doi) if possible_recid: matching_recids.add(possible_recid) if len(matching_recids) > 1: # Oops, this record refers to DOI existing in multiple records. # Dunno which one to choose. write_message(" Failed: Multiple records found in the" \ " database %s that match the DOI(s) in the input" \ " MARCXML %s" % (repr(matching_recids), repr(record_dois)), verbose=1, stream=sys.stderr) return -1 elif len(matching_recids) == 1: rec_id = matching_recids.pop() if opt_mode == 'insert': write_message(" Failed: DOI tag matching record #%s found in the xml" \ " submitted, you should use the option replace," \ " correct or append to replace an existing" \ " record. (-h for help)" % rec_id, verbose=1, stream=sys.stderr) return -1 else: write_message(" - Tag DOI value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag DOI not found in the xml marc file.", verbose=9) # Now we should have detected rec_id from SYSNO or OAIID # tags. (None otherwise.) if rec_id: if opt_mode == 'insert': write_message(" Failed: Record found in the database," \ " you should use the option replace," \ " correct or append to replace an existing" \ " record. (-h for help)", verbose=1, stream=sys.stderr) return -1 else: if opt_mode != 'insert' and \ opt_mode != 'replace_or_insert': write_message(" Failed: Record not found in the database."\ " Please insert the file before updating it."\ " (-h for help)", verbose=1, stream=sys.stderr) return -1 return rec_id and int(rec_id) or None def check_record_doi_is_unique(rec_id, record): """ Check that DOI found in 'record' does not exist in any other record than 'recid'. Return (boolean, msg) where 'boolean' would be True if the DOI is unique. """ record_dois = record_extract_dois(record) if record_dois: matching_recids = set() for record_doi in record_dois: possible_recid = find_record_from_doi(record_doi) if possible_recid: matching_recids.add(possible_recid) if len(matching_recids) > 1: # Oops, this record refers to DOI existing in multiple records. msg = " Failed: Multiple records found in the" \ " database %s that match the DOI(s) in the input" \ " MARCXML %s" % (repr(matching_recids), repr(record_dois)) return (False, msg) elif len(matching_recids) == 1: matching_recid = matching_recids.pop() if str(matching_recid) != str(rec_id): # Oops, this record refers to DOI existing in a different record. 
msg = " Failed: DOI(s) %s found in this record (#%s)" \ " already exist(s) in another other record (#%s)" % \ (repr(record_dois), rec_id, matching_recid) return (False, msg) return (True, "") ### Insert functions def create_new_record(rec_id=None, pretend=False): """ Create new record in the database @param rec_id: if specified the new record will have this rec_id. @type rec_id: int @return: the allocated rec_id @rtype: int @note: in case of errors will be returned None """ if rec_id is not None: try: rec_id = int(rec_id) except (ValueError, TypeError), error: write_message(" ERROR: during the creation_new_record function: %s " % error, verbose=1, stream=sys.stderr) return None if run_sql("SELECT id FROM bibrec WHERE id=%s", (rec_id, )): write_message(" ERROR: during the creation_new_record function: the requested rec_id %s already exists." % rec_id) return None if pretend: if rec_id: return rec_id else: return run_sql("SELECT max(id)+1 FROM bibrec")[0][0] if rec_id is not None: return run_sql("INSERT INTO bibrec (id, creation_date, modification_date) VALUES (%s, NOW(), NOW())", (rec_id, )) else: return run_sql("INSERT INTO bibrec (creation_date, modification_date) VALUES (NOW(), NOW())") def insert_bibfmt(id_bibrec, marc, bibformat, modification_date='1970-01-01 00:00:00', pretend=False): """Insert the format in the table bibfmt""" # compress the marc value pickled_marc = compress(marc) try: time.strptime(modification_date, "%Y-%m-%d %H:%M:%S") except ValueError: modification_date = '1970-01-01 00:00:00' query = """INSERT LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)""" if not pretend: row_id = run_sql(query, (id_bibrec, bibformat, modification_date, pickled_marc)) return row_id else: return 1 def insert_record_bibxxx(tag, value, pretend=False): """Insert the record into bibxxx""" # determine into which table one should insert the record table_name = 'bib'+tag[0:2]+'x' # check if the tag, value combination exists in the table query = """SELECT id,value FROM %s """ % table_name query += """ WHERE tag=%s AND value=%s""" params = (tag, value) res = None res = run_sql(query, params) # Note: compare now the found values one by one and look for # string binary equality (e.g. to respect lowercase/uppercase # match), regardless of the charset etc settings. Ideally we # could use a BINARY operator in the above SELECT statement, but # we would have to check compatibility on various MySQLdb versions # etc; this approach checks all matched values in Python, not in # MySQL, which is less cool, but more conservative, so it should # work better on most setups. if res: for row in res: row_id = row[0] row_value = row[1] if row_value == value: return (table_name, row_id) # We got here only when the tag, value combination was not found, # so it is now necessary to insert the tag, value combination into # bibxxx table as new. 
query = """INSERT INTO %s """ % table_name query += """ (tag, value) values (%s , %s)""" params = (tag, value) if not pretend: row_id = run_sql(query, params) else: return (table_name, 1) return (table_name, row_id) def insert_record_bibrec_bibxxx(table_name, id_bibxxx, field_number, id_bibrec, pretend=False): """Insert the record into bibrec_bibxxx""" # determine into which table one should insert the record full_table_name = 'bibrec_'+ table_name # insert the proper row into the table query = """INSERT INTO %s """ % full_table_name query += """(id_bibrec,id_bibxxx, field_number) values (%s , %s, %s)""" params = (id_bibrec, id_bibxxx, field_number) if not pretend: res = run_sql(query, params) else: return 1 return res def synchronize_8564(rec_id, record, record_had_FFT, bibrecdocs, pretend=False): """ Synchronize 8564_ tags and BibDocFile tables. This function directly manipulate the record parameter. @type rec_id: positive integer @param rec_id: the record identifier. @param record: the record structure as created by bibrecord.create_record @type record_had_FFT: boolean @param record_had_FFT: True if the incoming bibuploaded-record used FFT @return: the manipulated record (which is also modified as a side effect) """ def merge_marc_into_bibdocfile(field, pretend=False): """ Internal function that reads a single field and stores its content in BibDocFile tables. @param field: the 8564_ field containing a BibDocFile URL. """ write_message('Merging field: %s' % (field, ), verbose=9) url = field_get_subfield_values(field, 'u')[:1] or field_get_subfield_values(field, 'q')[:1] description = field_get_subfield_values(field, 'y')[:1] comment = field_get_subfield_values(field, 'z')[:1] if url: recid, docname, docformat = decompose_bibdocfile_url(url[0]) if recid != rec_id: write_message("INFO: URL %s is not pointing to a fulltext owned by this record (%s)" % (url, recid), stream=sys.stderr) else: try: bibdoc = bibrecdocs.get_bibdoc(docname) if description and not pretend: bibdoc.set_description(description[0], docformat) if comment and not pretend: bibdoc.set_comment(comment[0], docformat) except InvenioBibDocFileError: ## Apparently the referenced docname doesn't exist anymore. ## Too bad. Let's skip it. write_message("WARNING: docname %s does not seem to exist for record %s. Has it been renamed outside FFT?" % (docname, recid), stream=sys.stderr) def merge_bibdocfile_into_marc(field, subfields): """ Internal function that reads BibDocFile table entries referenced by the URL in the given 8564_ field and integrate the given information directly with the provided subfields. @param field: the 8564_ field containing a BibDocFile URL. @param subfields: the subfields corresponding to the BibDocFile URL generated after BibDocFile tables. """ write_message('Merging subfields %s into field %s' % (subfields, field), verbose=9) subfields = dict(subfields) ## We make a copy not to have side-effects subfield_to_delete = [] for subfield_position, (code, value) in enumerate(field_get_subfield_instances(field)): ## For each subfield instance already existing... if code in subfields: ## ...We substitute it with what is in BibDocFile tables record_modify_subfield(record, '856', code, subfields[code], subfield_position, field_position_global=field[4]) del subfields[code] else: ## ...We delete it otherwise subfield_to_delete.append(subfield_position) subfield_to_delete.sort() for counter, position in enumerate(subfield_to_delete): ## FIXME: Very hackish algorithm. 
Since deleting a subfield ## will alter the positions of the following subfields, we ## are taking note of this and adjusting further positions ## by using a counter. record_delete_subfield_from(record, '856', position - counter, field_position_global=field[4]) subfields = subfields.items() subfields.sort() for code, value in subfields: ## Let's add non-previously existing subfields record_add_subfield_into(record, '856', code, value, field_position_global=field[4]) def get_bibdocfile_managed_info(): """ Internal function, returns a dictionary of BibDocFile URL -> wanna-be subfields. This information is retrieved from internal BibDoc structures rather than from input MARC XML files @rtype: mapping @return: BibDocFile URL -> wanna-be subfields dictionary """ ret = {} latest_files = bibrecdocs.list_latest_files(list_hidden=False) for afile in latest_files: url = afile.get_url() ret[url] = {'u': url} description = afile.get_description() comment = afile.get_comment() subformat = afile.get_subformat() size = afile.get_size() if description: ret[url]['y'] = description if comment: ret[url]['z'] = comment if subformat: ret[url]['x'] = subformat ret[url]['s'] = str(size) return ret write_message("Synchronizing MARC of recid '%s' with:\n%s" % (rec_id, record), verbose=9) tags856s = record_get_field_instances(record, '856', '%', '%') write_message("Original 856%% instances: %s" % tags856s, verbose=9) tags8564s_to_add = get_bibdocfile_managed_info() write_message("BibDocFile instances: %s" % tags8564s_to_add, verbose=9) positions_tags8564s_to_remove = [] for local_position, field in enumerate(tags856s): if field[1] == '4' and field[2] == ' ': write_message('Analysing %s' % (field, ), verbose=9) for url in field_get_subfield_values(field, 'u') + field_get_subfield_values(field, 'q'): if url in tags8564s_to_add: # there exists a link in the MARC of the record and the connection exists in BibDoc tables if record_had_FFT: merge_bibdocfile_into_marc(field, tags8564s_to_add[url]) else: merge_marc_into_bibdocfile(field, pretend=pretend) del tags8564s_to_add[url] break elif bibdocfile_url_p(url) and decompose_bibdocfile_url(url)[0] == rec_id: # The link exists and is a potentially correct-looking link to a document; # moreover, it refers to the current record id ... but it does not exist in # internal BibDoc structures. This could have happened in the case of renaming a document # or its removal. In both cases we have to remove the link... a new one will be created positions_tags8564s_to_remove.append(local_position) write_message("%s to be deleted and re-synchronized" % (field, ), verbose=9) break record_delete_fields(record, '856', positions_tags8564s_to_remove) tags8564s_to_add = tags8564s_to_add.values() tags8564s_to_add.sort() ## FIXME: we are not yet able to preserve the sorting ## of 8564 tags WRT FFT in BibUpload. ## See ticket #1606.
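## (illustrative) each entry of tags8564s_to_add, as built by
## get_bibdocfile_managed_info() above, is a wanna-be-subfields dict with
## hypothetical values like:
##   {'u': 'http://your.site.com/record/123/files/foo.pdf',
##    's': '123456', 'x': 'pdfa', 'y': 'Fulltext'}
## the loop below turns each of them into a fresh 8564_ field.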
for subfields in tags8564s_to_add: subfields = subfields.items() subfields.sort() record_add_field(record, '856', '4', ' ', subfields=subfields) write_message('Final record: %s' % record, verbose=9) return record def _get_subfield_value(field, subfield_code, default=None): res = field_get_subfield_values(field, subfield_code) if res != [] and res != None: return res[0] else: return default def elaborate_mit_tags(record, rec_id, mode, pretend = False, tmp_ids = {}, tmp_vers = {}): """ Uploading MoreInfo -> BDM tags """ tuple_list = extract_tag_from_record(record, 'BDM') # Now gathering information from BDM tags - to be processed later write_message("Processing BDM entries of the record ") recordDocs = BibRecDocs(rec_id) if tuple_list: for mit in record_get_field_instances(record, 'BDM', ' ', ' '): relation_id = _get_subfield_value(mit, "r") bibdoc_id = _get_subfield_value(mit, "i") # checking for a possibly temporary ID if not (bibdoc_id is None): bibdoc_id = resolve_identifier(tmp_ids, bibdoc_id) bibdoc_ver = _get_subfield_value(mit, "v") if not (bibdoc_ver is None): bibdoc_ver = resolve_identifier(tmp_vers, bibdoc_ver) bibdoc_name = _get_subfield_value(mit, "n") bibdoc_fmt = _get_subfield_value(mit, "f") moreinfo_str = _get_subfield_value(mit, "m") if bibdoc_id is None: if bibdoc_name is None: raise StandardError("Incorrect relation. Neither name nor identifier of the object has been specified") else: # retrieving the ID based on the document name (inside the current record) # The document is attached to the current record. try: bibdoc_id = recordDocs.get_docid(bibdoc_name) except: raise StandardError("BibDoc with name %s does not exist within the record" % (bibdoc_name, )) else: if bibdoc_name != None: write_message("WARNING: both name and id of the document have been specified. Ignoring the name", stream=sys.stderr) if (moreinfo_str is None or mode in ("replace", "correct")) and (not pretend): MoreInfo(docid=bibdoc_id , version = bibdoc_ver, docformat = bibdoc_fmt, relation = relation_id).delete() if (not moreinfo_str is None) and (not pretend): MoreInfo.create_from_serialised(moreinfo_str, docid=bibdoc_id, version = bibdoc_ver, docformat = bibdoc_fmt, relation = relation_id) return record def elaborate_brt_tags(record, rec_id, mode, pretend=False, tmp_ids = {}, tmp_vers = {}): """ Process BDR tags describing relations between existing objects """ tuple_list = extract_tag_from_record(record, 'BDR') # Now gathering information from BDR tags - to be processed later relations_to_create = [] write_message("Processing BDR entries of the record ") recordDocs = BibRecDocs(rec_id) # TODO: check what happens if there is no record yet! Will the class represent an empty set? if tuple_list: for brt in record_get_field_instances(record, 'BDR', ' ', ' '): relation_id = _get_subfield_value(brt, "r") bibdoc1_id = None bibdoc1_name = None bibdoc1_ver = None bibdoc1_fmt = None bibdoc2_id = None bibdoc2_name = None bibdoc2_ver = None bibdoc2_fmt = None if not relation_id: bibdoc1_id = _get_subfield_value(brt, "i") bibdoc1_name = _get_subfield_value(brt, "n") if bibdoc1_id is None: if bibdoc1_name is None: raise StandardError("Incorrect relation. Neither name nor identifier of the first object has been specified") else: # retrieving the ID based on the document name (inside the current record) # The document is attached to the current record.
try: bibdoc1_id = recordDocs.get_docid(bibdoc1_name) except: raise StandardError("BibDoc with name %s does not exist within the record" % \ (bibdoc1_name, )) else: # resolving temporary identifier bibdoc1_id = resolve_identifier(tmp_ids, bibdoc1_id) if bibdoc1_name != None: write_message("WARNING: both name and id of the first document of a relation have been specified. Ignoring the name", stream=sys.stderr) bibdoc1_ver = _get_subfield_value(brt, "v") if not (bibdoc1_ver is None): bibdoc1_ver = resolve_identifier(tmp_vers, bibdoc1_ver) bibdoc1_fmt = _get_subfield_value(brt, "f") bibdoc2_id = _get_subfield_value(brt, "j") bibdoc2_name = _get_subfield_value(brt, "o") if bibdoc2_id is None: if bibdoc2_name is None: raise StandardError("Incorrect relation. Neither name nor identifier of the second object has been specified") else: # retrieving the ID based on the document name (inside the current record) # The document is attached to the current record. try: bibdoc2_id = recordDocs.get_docid(bibdoc2_name) except: raise StandardError("BibDoc with name %s does not exist within the record" % (bibdoc2_name, )) else: bibdoc2_id = resolve_identifier(tmp_ids, bibdoc2_id) if bibdoc2_name != None: write_message("WARNING: both name and id of the second document of a relation have been specified. Ignoring the name", stream=sys.stderr) bibdoc2_ver = _get_subfield_value(brt, "w") if not (bibdoc2_ver is None): bibdoc2_ver = resolve_identifier(tmp_vers, bibdoc2_ver) bibdoc2_fmt = _get_subfield_value(brt, "g") control_command = _get_subfield_value(brt, "d") relation_type = _get_subfield_value(brt, "t") if not relation_type and not relation_id: raise StandardError("The relation type must be specified") more_info = _get_subfield_value(brt, "m") # the relation id might be specified in the case of updating # the MoreInfo table instead of other fields rel_obj = None if not relation_id: rels = BibRelation.get_relations(rel_type = relation_type, bibdoc1_id = bibdoc1_id, bibdoc2_id = bibdoc2_id, bibdoc1_ver = bibdoc1_ver, bibdoc2_ver = bibdoc2_ver, bibdoc1_fmt = bibdoc1_fmt, bibdoc2_fmt = bibdoc2_fmt) if len(rels) > 0: rel_obj = rels[0] relation_id = rel_obj.id else: rel_obj = BibRelation(rel_id=relation_id) relations_to_create.append((relation_id, bibdoc1_id, bibdoc1_ver, bibdoc1_fmt, bibdoc2_id, bibdoc2_ver, bibdoc2_fmt, relation_type, more_info, rel_obj, control_command)) record_delete_field(record, 'BDR', ' ', ' ') if mode in ("insert", "replace_or_insert", "append", "correct", "replace"): # now creating relations between objects based on the data if not pretend: for (relation_id, bibdoc1_id, bibdoc1_ver, bibdoc1_fmt, bibdoc2_id, bibdoc2_ver, bibdoc2_fmt, rel_type, more_info, rel_obj, control_command) in relations_to_create: if rel_obj is None: rel_obj = BibRelation.create(bibdoc1_id = bibdoc1_id, bibdoc1_ver = bibdoc1_ver, bibdoc1_fmt = bibdoc1_fmt, bibdoc2_id = bibdoc2_id, bibdoc2_ver = bibdoc2_ver, bibdoc2_fmt = bibdoc2_fmt, rel_type = rel_type) relation_id = rel_obj.id if mode == "replace": # Clearing existing MoreInfo content rel_obj.get_more_info().delete() if more_info: MoreInfo.create_from_serialised(more_info, relation = relation_id) if control_command == "DELETE": rel_obj.delete() else: write_message("BDR tag is not processed in the %s mode" % (mode, )) return record def elaborate_fft_tags(record, rec_id, mode, pretend=False, tmp_ids = {}, tmp_vers = {}, bibrecdocs=None): """ Process FFT tags that should contain $a with file paths or URLs to get the fulltext from.
This function enriches record with proper 8564 URL tags, downloads fulltext files and stores them into var/data structure where appropriate. CFG_BIBUPLOAD_WGET_SLEEP_TIME defines time to sleep in seconds in between URL downloads. Note: if an FFT tag contains multiple $a subfields, we upload them into different 856 URL tags in the metadata. See regression test case test_multiple_fft_insert_via_http(). """ # Let's define some handy sub procedure. def _add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, modification_date, pretend=False): """Adds a new format for a given bibdoc. Returns True when everything's fine.""" write_message('Add new format to %s url: %s, format: %s, docname: %s, doctype: %s, newname: %s, description: %s, comment: %s, flags: %s, modification_date: %s' % (repr(bibdoc), url, docformat, docname, doctype, newname, description, comment, flags, modification_date), verbose=9) try: if not url: # Not requesting a new url. Just updating comment & description return _update_description_and_comment(bibdoc, docname, docformat, description, comment, flags, pretend=pretend) try: if not pretend: bibdoc.add_file_new_format(url, description=description, comment=comment, flags=flags, modification_date=modification_date) except StandardError, e: write_message("('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') not inserted because format already exists (%s)." % (url, docformat, docname, doctype, newname, description, comment, flags, modification_date, e), stream=sys.stderr) raise except Exception, e: write_message("ERROR: in adding '%s' as a new format because of: %s" % (url, e), stream=sys.stderr) raise return True def _add_new_version(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, modification_date, pretend=False): """Adds a new version for a given bibdoc. Returns True when everything's fine.""" write_message('Add new version to %s url: %s, format: %s, docname: %s, doctype: %s, newname: %s, description: %s, comment: %s, flags: %s' % (repr(bibdoc), url, docformat, docname, doctype, newname, description, comment, flags), verbose=9) try: if not url: return _update_description_and_comment(bibdoc, docname, docformat, description, comment, flags, pretend=pretend) try: if not pretend: bibdoc.add_file_new_version(url, description=description, comment=comment, flags=flags, modification_date=modification_date) except StandardError, e: write_message("('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') not inserted because '%s'." % (url, docformat, docname, doctype, newname, description, comment, flags, modification_date, e), stream=sys.stderr) raise except Exception, e: write_message("ERROR: in adding '%s' as a new version because of: %s" % (url, e), stream=sys.stderr) raise return True def _update_description_and_comment(bibdoc, docname, docformat, description, comment, flags, pretend=False): """Directly update comments and descriptions.""" write_message('Just updating description and comment for %s with format %s with description %s, comment %s and flags %s' % (docname, docformat, description, comment, flags), verbose=9) try: if not pretend: bibdoc.set_description(description, docformat) bibdoc.set_comment(comment, docformat) for flag in CFG_BIBDOCFILE_AVAILABLE_FLAGS: if flag in flags: bibdoc.set_flag(flag, docformat) else: bibdoc.unset_flag(flag, docformat) except StandardError, e: write_message("('%s', '%s', '%s', '%s', '%s') description and comment not updated because '%s'." 
% (docname, docformat, description, comment, flags, e)) raise return True def _process_document_moreinfos(more_infos, docname, version, docformat, mode): if mode not in ('correct', 'append', 'replace_or_insert', 'replace', 'insert'): print "exited because the mode is incorrect" return docid = None try: docid = bibrecdocs.get_docid(docname) except: raise StandardError("MoreInfo: No document of a given name associated with the record") if not version: # We have to retrieve the most recent version ... version = bibrecdocs.get_bibdoc(docname).get_latest_version() doc_moreinfo_s, version_moreinfo_s, version_format_moreinfo_s, format_moreinfo_s = more_infos if mode in ("replace", "replace_or_insert"): if doc_moreinfo_s: # only if specified, otherwise do not touch MoreInfo(docid = docid).delete() if format_moreinfo_s: # only if specified... otherwise do not touch MoreInfo(docid = docid, docformat = docformat).delete() if not doc_moreinfo_s is None: MoreInfo.create_from_serialised(ser_str = doc_moreinfo_s, docid = docid) if not version_moreinfo_s is None: MoreInfo.create_from_serialised(ser_str = version_moreinfo_s, docid = docid, version = version) if not version_format_moreinfo_s is None: MoreInfo.create_from_serialised(ser_str = version_format_moreinfo_s, docid = docid, version = version, docformat = docformat) if not format_moreinfo_s is None: MoreInfo.create_from_serialised(ser_str = format_moreinfo_s, docid = docid, docformat = docformat) if mode == 'delete': raise StandardError('FFT tag specified but bibupload executed in --delete mode') tuple_list = extract_tag_from_record(record, 'FFT') if tuple_list: # FFT Tags analysis write_message("FFTs: "+str(tuple_list), verbose=9) docs = {} # docnames and their data for fft in record_get_field_instances(record, 'FFT', ' ', ' '): # Very first, we retrieve the potentially temporary identifiers... # even if the rest fails, we should include them in the dictionary version = _get_subfield_value(fft, 'v', '') # checking if the version is temporary... if so, filling a different variable is_tmp_ver, bibdoc_tmpver = parse_identifier(version) if is_tmp_ver: version = None else: bibdoc_tmpver = None if not version: # treating cases of empty string etc... version = None bibdoc_tmpid = field_get_subfield_values(fft, 'i') if bibdoc_tmpid: bibdoc_tmpid = bibdoc_tmpid[0] else: bibdoc_tmpid = None is_tmp_id, bibdoc_tmpid = parse_identifier(bibdoc_tmpid) if not is_tmp_id: bibdoc_tmpid = None # In the case of temporary ids, we don't resolve them yet but signal that they have been used # value -1 means that the identifier has been declared but not assigned a value yet if bibdoc_tmpid: if bibdoc_tmpid in tmp_ids: write_message("WARNING: the temporary identifier %s has been declared more than once. Ignoring the second occurrence" % (bibdoc_tmpid, ), stream=sys.stderr) else: tmp_ids[bibdoc_tmpid] = -1 if bibdoc_tmpver: if bibdoc_tmpver in tmp_vers: write_message("WARNING: the temporary version identifier %s has been declared more than once. Ignoring the second occurrence" % (bibdoc_tmpver, ), stream=sys.stderr) else: tmp_vers[bibdoc_tmpver] = -1 # Let's discover the type of the document # This is a legacy field and no particular check will be enforced on it. doctype = _get_subfield_value(fft, 't', 'Main') # Default is Main # Let's discover the url.
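# (illustrative sketch) an incoming FFT field, in MARCXML, typically looks
# like the following (all values hypothetical):
#   <datafield tag="FFT" ind1=" " ind2=" ">
#     <subfield code="a">http://example.org/fulltext.pdf</subfield>
#     <subfield code="n">fulltext</subfield>
#     <subfield code="d">Some description</subfield>
#   </datafield>
# $a is read just below; the remaining subfields are picked up further down.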
            # Let's discover the url.
            url = field_get_subfield_values(fft, 'a')
            if url:
                url = url[0]
                try:
                    check_valid_url(url)
                except StandardError, e:
                    raise StandardError, "fft '%s' specifies in $a a location ('%s') with problems: %s" % (fft, url, e)
            else:
                url = ''

            # TODO: a lot of this code could be made more compact by using a
            # similar syntax for all the subfields ... the right-hand side
            # expressions may look a bit cryptic, but the elaborate_fft
            # function as a whole would be much clearer.
            if mode == 'correct' and doctype != 'FIX-MARC':
                arg2 = ""
            else:
                arg2 = KEEP_OLD_VALUE
            description = _get_subfield_value(fft, 'd', arg2)

            # Let's discover the description
            # description = field_get_subfield_values(fft, 'd')
            # if description != []:
            #     description = description[0]
            # else:
            #     if mode == 'correct' and doctype != 'FIX-MARC':
            #         ## If the user requires a correction and does not specify
            #         ## a description, this means she really wants to
            #         ## modify the description.
            #         description = ''
            #     else:
            #         description = KEEP_OLD_VALUE

            # Let's discover the desired docname to be created/altered.
            name = field_get_subfield_values(fft, 'n')
            if name:
                ## Let's remove undesired extensions.
                name = file_strip_ext(name[0] + '.pdf')
            else:
                if url:
                    name = get_docname_from_url(url)
                elif mode != 'correct' and doctype != 'FIX-MARC':
                    raise StandardError, "WARNING: fft '%s' specifies neither a location in $a nor a docname in $n" % str(fft)
                else:
                    continue

            # Let's discover the desired new docname in case we want to change it.
            newname = field_get_subfield_values(fft, 'm')
            if newname:
                newname = file_strip_ext(newname[0] + '.pdf')
            else:
                newname = name

            # Let's discover the desired format.
            docformat = field_get_subfield_values(fft, 'f')
            if docformat:
                docformat = normalize_format(docformat[0])
            else:
                if url:
                    docformat = guess_format_from_url(url)
                else:
                    docformat = ""

            # Let's discover the icon.
            icon = field_get_subfield_values(fft, 'x')
            if icon != []:
                icon = icon[0]
                if icon != KEEP_OLD_VALUE:
                    try:
                        check_valid_url(icon)
                    except StandardError, e:
                        raise StandardError, "fft '%s' specifies in $x an icon ('%s') with problems: %s" % (fft, icon, e)
            else:
                icon = ''

            # Let's discover the comment.
            comment = field_get_subfield_values(fft, 'z')
            if comment != []:
                comment = comment[0]
            else:
                if mode == 'correct' and doctype != 'FIX-MARC':
                    ## See comment on description.
                    comment = ''
                else:
                    comment = KEEP_OLD_VALUE

            # Let's discover the restriction.
            restriction = field_get_subfield_values(fft, 'r')
            if restriction != []:
                restriction = restriction[0]
            else:
                if mode == 'correct' and doctype != 'FIX-MARC':
                    ## See comment on description.
                    restriction = ''
                else:
                    restriction = KEEP_OLD_VALUE

            document_moreinfo = _get_subfield_value(fft, 'w')
            version_moreinfo = _get_subfield_value(fft, 'p')
            version_format_moreinfo = _get_subfield_value(fft, 'b')
            format_moreinfo = _get_subfield_value(fft, 'u')

            # Let's discover the timestamp of the file (if any).
            timestamp = field_get_subfield_values(fft, 's')
            if timestamp:
                try:
                    timestamp = datetime(*(time.strptime(timestamp[0], "%Y-%m-%d %H:%M:%S")[:6]))
                except ValueError:
                    write_message('WARNING: The timestamp is not in a good format, thus will be ignored. The format should be YYYY-MM-DD HH:MM:SS', stream=sys.stderr)
                    timestamp = ''
            else:
                timestamp = ''

            flags = field_get_subfield_values(fft, 'o')
            for flag in flags:
                if flag not in CFG_BIBDOCFILE_AVAILABLE_FLAGS:
                    raise StandardError, "fft '%s' specifies a non available flag: %s" % (fft, flag)

            if name in docs: # new format considered
                (doctype2, newname2, restriction2, version2, urls, dummybibdoc_moreinfos2, dummybibdoc_tmpid2, dummybibdoc_tmpver2) = docs[name]
                if doctype2 != doctype:
                    raise StandardError, "fft '%s' specifies a different doctype from previous fft with docname '%s'" % (str(fft), name)
                if newname2 != newname:
                    raise StandardError, "fft '%s' specifies a different newname from previous fft with docname '%s'" % (str(fft), name)
                if restriction2 != restriction:
                    raise StandardError, "fft '%s' specifies a different restriction from previous fft with docname '%s'" % (str(fft), name)
                if version2 != version:
                    raise StandardError, "fft '%s' specifies a different version than the previous fft with docname '%s'" % (str(fft), name)
                for (dummyurl2, format2, dummydescription2, dummycomment2, dummyflags2, dummytimestamp2) in urls:
                    if docformat == format2:
                        raise StandardError, "fft '%s' specifies a second file '%s' with the same format '%s' from previous fft with docname '%s'" % (str(fft), url, docformat, name)
                if url or docformat:
                    urls.append((url, docformat, description, comment, flags, timestamp))
                if icon:
                    urls.append((icon, icon[len(file_strip_ext(icon)):] + ';icon', description, comment, flags, timestamp))
            else:
                if url or docformat:
                    docs[name] = (doctype, newname, restriction, version, [(url, docformat, description, comment, flags, timestamp)], [document_moreinfo, version_moreinfo, version_format_moreinfo, format_moreinfo], bibdoc_tmpid, bibdoc_tmpver)
                    if icon:
                        docs[name][4].append((icon, icon[len(file_strip_ext(icon)):] + ';icon', description, comment, flags, timestamp))
                elif icon:
                    docs[name] = (doctype, newname, restriction, version, [(icon, icon[len(file_strip_ext(icon)):] + ';icon', description, comment, flags, timestamp)], [document_moreinfo, version_moreinfo, version_format_moreinfo, format_moreinfo], bibdoc_tmpid, bibdoc_tmpver)
                else:
                    docs[name] = (doctype, newname, restriction, version, [], [document_moreinfo, version_moreinfo, version_format_moreinfo, format_moreinfo], bibdoc_tmpid, bibdoc_tmpver)

        write_message('Result of FFT analysis:\n\tDocs: %s' % (docs,), verbose=9)

        # Let's remove all FFT tags.
        record_delete_field(record, 'FFT', ' ', ' ')
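        ## Illustrative sketch (inferred from the code above, not part of the
        ## original code): at this point `docs` maps each docname to a tuple
        ## with the collected FFT data, e.g. for the hypothetical FFT shown
        ## earlier:
        ##
        ##   docs['example'] = ('Main',          # doctype
        ##                      'example',        # newname
        ##                      KEEP_OLD_VALUE,   # restriction
        ##                      None,             # version
        ##                      [('/tmp/example.pdf', '.pdf', 'Fulltext of the example record', KEEP_OLD_VALUE, [], '')],
        ##                      [None, None, None, None],  # MoreInfo serializations
        ##                      None,             # bibdoc_tmpid
        ##                      None)             # bibdoc_tmpver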
        ## Let's pre-download all the URLs to see whether, in case of mode
        ## 'correct' or 'append', we can avoid creating a new revision.
        for docname, (doctype, newname, restriction, version, urls, more_infos, bibdoc_tmpid, bibdoc_tmpver) in docs.items():
            downloaded_urls = []
            try:
                bibdoc = bibrecdocs.get_bibdoc(docname)
            except InvenioBibDocFileError:
                ## A bibdoc with the given docname does not exist.
                ## So there is no chance we are going to revise an existing
                ## format with an identical file :-)
                bibdoc = None

            new_revision_needed = False
            for url, docformat, description, comment, flags, timestamp in urls:
                if url:
                    try:
                        downloaded_url = download_url(url, docformat)
                        write_message("%s saved into %s" % (url, downloaded_url), verbose=9)
                    except Exception, err:
                        write_message("ERROR: in downloading '%s' because of: %s" % (url, err), stream=sys.stderr)
                        raise
                    if mode == 'correct' and bibdoc is not None and not new_revision_needed:
                        downloaded_urls.append((downloaded_url, docformat, description, comment, flags, timestamp))
                        if not bibrecdocs.check_file_exists(downloaded_url, docformat):
                            new_revision_needed = True
                        else:
                            write_message("WARNING: %s is already attached to bibdoc %s for recid %s" % (url, docname, rec_id), stream=sys.stderr)
                    elif mode == 'append' and bibdoc is not None:
                        if not bibrecdocs.check_file_exists(downloaded_url, docformat):
                            downloaded_urls.append((downloaded_url, docformat, description, comment, flags, timestamp))
                        else:
                            write_message("WARNING: %s is already attached to bibdoc %s for recid %s" % (url, docname, rec_id), stream=sys.stderr)
                    else:
                        downloaded_urls.append((downloaded_url, docformat, description, comment, flags, timestamp))
                else:
                    downloaded_urls.append(('', docformat, description, comment, flags, timestamp))

            if mode == 'correct' and bibdoc is not None and not new_revision_needed:
                ## Since we don't need a new revision (because none of the
                ## files being uploaded differs from the ones already
                ## attached), we can simply drop the urls but keep the other
                ## information.
                write_message("No need to add a new revision for docname %s for recid %s" % (docname, rec_id), verbose=2)
                docs[docname] = (doctype, newname, restriction, version, [('', docformat, description, comment, flags, timestamp) for (dummy, docformat, description, comment, flags, timestamp) in downloaded_urls], more_infos, bibdoc_tmpid, bibdoc_tmpver)
                for downloaded_url, dummy, dummy, dummy, dummy, dummy in downloaded_urls:
                    ## Let's free up some space :-)
                    if downloaded_url and os.path.exists(downloaded_url):
                        os.remove(downloaded_url)
            else:
                if downloaded_urls or mode != 'append':
                    docs[docname] = (doctype, newname, restriction, version, downloaded_urls, more_infos, bibdoc_tmpid, bibdoc_tmpver)
                else:
                    ## In case we are in append mode and there are no urls to
                    ## append, we discard the whole FFT.
                    del docs[docname]

        if mode == 'replace': # First we erase previous bibdocs
            if not pretend:
                for bibdoc in bibrecdocs.list_bibdocs():
                    bibdoc.delete()
                bibrecdocs.dirty = True

        for docname, (doctype, newname, restriction, version, urls, more_infos, bibdoc_tmpid, bibdoc_tmpver) in docs.iteritems():
            write_message("Elaborating olddocname: '%s', newdocname: '%s', doctype: '%s', restriction: '%s', urls: '%s', mode: '%s'" % (docname, newname, doctype, restriction, urls, mode), verbose=9)
            if mode in ('insert', 'replace'): # new bibdocs, new docnames, new marc
                if newname in bibrecdocs.get_bibdoc_names():
                    write_message("('%s', '%s') not inserted because docname already exists." % (newname, urls), stream=sys.stderr)
                    raise StandardError("('%s', '%s') not inserted because docname already exists." % (newname, urls))
                try:
                    if not pretend:
                        bibdoc = bibrecdocs.add_bibdoc(doctype, newname)
                        bibdoc.set_status(restriction)
                    else:
                        bibdoc = None
                except Exception, e:
                    write_message("('%s', '%s', '%s') not inserted because: '%s'." % (doctype, newname, urls, e), stream=sys.stderr)
                    raise e
                for (url, docformat, description, comment, flags, timestamp) in urls:
                    assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp, pretend=pretend))
            elif mode == 'replace_or_insert': # to be thought of as correct_or_insert
                try:
                    bibdoc = bibrecdocs.get_bibdoc(docname)
                    found_bibdoc = True
                except InvenioBibDocFileError:
                    found_bibdoc = False
                else:
                    if doctype not in ('PURGE', 'DELETE', 'EXPUNGE', 'REVERT', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE'):
                        if newname != docname:
                            try:
                                if not pretend:
                                    bibrecdocs.change_name(newname=newname, docid=bibdoc.id)
                                    write_message(lambda: "After renaming: %s" % bibrecdocs, verbose=9)
                            except StandardError, e:
                                write_message('ERROR: in renaming %s to %s: %s' % (docname, newname, e), stream=sys.stderr)
                                raise
                try:
                    bibdoc = bibrecdocs.get_bibdoc(newname)
                    found_bibdoc = True
                except InvenioBibDocFileError:
                    found_bibdoc = False
                else:
                    if doctype == 'PURGE':
                        if not pretend:
                            bibdoc.purge()
                            bibrecdocs.dirty = True
                    elif doctype == 'DELETE':
                        if not pretend:
                            bibdoc.delete()
                            bibrecdocs.dirty = True
                    elif doctype == 'EXPUNGE':
                        if not pretend:
                            bibdoc.expunge()
                            bibrecdocs.dirty = True
                    elif doctype == 'FIX-ALL':
                        if not pretend:
                            bibrecdocs.fix(docname)
                    elif doctype == 'FIX-MARC':
                        pass
                    elif doctype == 'DELETE-FILE':
                        if urls:
                            for (url, docformat, description, comment, flags, timestamp) in urls:
                                if not pretend:
                                    bibdoc.delete_file(docformat, version)
                    elif doctype == 'REVERT':
                        try:
                            if not pretend:
                                bibdoc.revert(version)
                        except Exception, e:
                            write_message('(%s, %s) not correctly reverted: %s' % (newname, version, e), stream=sys.stderr)
                            raise
                    else:
                        if restriction != KEEP_OLD_VALUE:
                            if not pretend:
                                bibdoc.set_status(restriction)
                        # Since the docname already existed, we have to first
                        # bump the version by pushing the first new file,
                        # then push the other files.
                        if urls:
                            (first_url, first_format, first_description, first_comment, first_flags, first_timestamp) = urls[0]
                            other_urls = urls[1:]
                            assert(_add_new_version(bibdoc, first_url, first_format, docname, doctype, newname, first_description, first_comment, first_flags, first_timestamp, pretend=pretend))
                            for (url, docformat, description, comment, flags, timestamp) in other_urls:
                                assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp, pretend=pretend))
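                ## Note (summary inferred from this module, not part of the
                ## original code): the special doctype keywords handled above
                ## act on an existing bibdoc instead of declaring a document
                ## type: PURGE drops older revisions, DELETE removes the
                ## bibdoc (recoverably), EXPUNGE erases it completely,
                ## FIX-ALL/FIX-MARC repair the document structure or only its
                ## MARC, DELETE-FILE removes a single file and REVERT
                ## restores the version given in $v.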
                ## Let's refresh the list of bibdocs.
                if not found_bibdoc:
                    if not pretend:
                        bibdoc = bibrecdocs.add_bibdoc(doctype, newname)
                        bibdoc.set_status(restriction)
                        for (url, docformat, description, comment, flags, timestamp) in urls:
                            assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp))
            elif mode == 'correct':
                try:
                    bibdoc = bibrecdocs.get_bibdoc(docname)
                    found_bibdoc = True
                except InvenioBibDocFileError:
                    found_bibdoc = False
                else:
                    if doctype not in ('PURGE', 'DELETE', 'EXPUNGE', 'REVERT', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE'):
                        if newname != docname:
                            try:
                                if not pretend:
                                    bibrecdocs.change_name(newname=newname, docid=bibdoc.id)
                                    write_message(lambda: "After renaming: %s" % bibrecdocs, verbose=9)
                            except StandardError, e:
                                write_message('ERROR: in renaming %s to %s: %s' % (docname, newname, e), stream=sys.stderr)
                                raise
                try:
                    bibdoc = bibrecdocs.get_bibdoc(newname)
                    found_bibdoc = True
                except InvenioBibDocFileError:
                    found_bibdoc = False
                else:
                    if doctype == 'PURGE':
                        if not pretend:
                            bibdoc.purge()
                            bibrecdocs.dirty = True
                    elif doctype == 'DELETE':
                        if not pretend:
                            bibdoc.delete()
                            bibrecdocs.dirty = True
                    elif doctype == 'EXPUNGE':
                        if not pretend:
                            bibdoc.expunge()
                            bibrecdocs.dirty = True
                    elif doctype == 'FIX-ALL':
                        if not pretend:
                            bibrecdocs.fix(newname)
                    elif doctype == 'FIX-MARC':
                        pass
                    elif doctype == 'DELETE-FILE':
                        if urls:
                            for (url, docformat, description, comment, flags, timestamp) in urls:
                                if not pretend:
                                    bibdoc.delete_file(docformat, version)
                    elif doctype == 'REVERT':
                        try:
                            if not pretend:
                                bibdoc.revert(version)
                        except Exception, e:
                            write_message('(%s, %s) not correctly reverted: %s' % (newname, version, e), stream=sys.stderr)
                            raise
                    else:
                        if restriction != KEEP_OLD_VALUE:
                            if not pretend:
                                bibdoc.set_status(restriction)
                        if doctype and doctype != KEEP_OLD_VALUE:
                            if not pretend:
                                bibdoc.change_doctype(doctype)
                        if urls:
                            (first_url, first_format, first_description, first_comment, first_flags, first_timestamp) = urls[0]
                            other_urls = urls[1:]
                            assert(_add_new_version(bibdoc, first_url, first_format, docname, doctype, newname, first_description, first_comment, first_flags, first_timestamp, pretend=pretend))
                            for (url, docformat, description, comment, flags, timestamp) in other_urls:
                                assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp, pretend=pretend))
                if not found_bibdoc:
                    if doctype in ('PURGE', 'DELETE', 'EXPUNGE', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE', 'REVERT'):
                        write_message("('%s', '%s', '%s') not performed because '%s' docname didn't exist." % (doctype, newname, urls, docname), stream=sys.stderr)
                        raise StandardError
                    else:
                        if not pretend:
                            bibdoc = bibrecdocs.add_bibdoc(doctype, newname)
                            bibdoc.set_status(restriction)
                            for (url, docformat, description, comment, flags, timestamp) in urls:
                                assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp))
            elif mode == 'append':
                found_bibdoc = False
                try:
                    bibdoc = bibrecdocs.get_bibdoc(docname)
                    found_bibdoc = True
                except InvenioBibDocFileError:
                    found_bibdoc = False
                else:
                    for (url, docformat, description, comment, flags, timestamp) in urls:
                        assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp, pretend=pretend))
                if not found_bibdoc:
                    try:
                        if not pretend:
                            bibdoc = bibrecdocs.add_bibdoc(doctype, docname)
                            bibdoc.set_status(restriction)
                            for (url, docformat, description, comment, flags, timestamp) in urls:
                                assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp))
                    except Exception, e:
                        register_exception()
                        write_message("('%s', '%s', '%s') not appended because: '%s'." % (doctype, newname, urls, e), stream=sys.stderr)
                        raise

            if not pretend:
                _process_document_moreinfos(more_infos, newname, version, urls and urls[0][1], mode)

            # Let's resolve the temporary version and identifier.
            if bibdoc_tmpid:
                if bibdoc_tmpid in tmp_ids and tmp_ids[bibdoc_tmpid] != -1:
                    write_message("WARNING: the temporary identifier %s has been declared more than once. Ignoring the second occurrence" % (bibdoc_tmpid, ), stream=sys.stderr)
                else:
                    tmp_ids[bibdoc_tmpid] = bibrecdocs.get_docid(docname)

            if bibdoc_tmpver:
                if bibdoc_tmpver in tmp_vers and tmp_vers[bibdoc_tmpver] != -1:
                    write_message("WARNING: the temporary version identifier %s has been declared more than once. Ignoring the second occurrence" % (bibdoc_tmpver, ), stream=sys.stderr)
                else:
                    if version is None:
                        tmp_vers[bibdoc_tmpver] = bibrecdocs.get_bibdoc(docname).get_latest_version()
                    else:
                        tmp_vers[bibdoc_tmpver] = version

    return record

### Update functions

def update_bibrec_date(now, bibrec_id, insert_mode_p, pretend=False):
    """Update the date of the record in the bibrec table."""
    if insert_mode_p:
        query = """UPDATE bibrec SET creation_date=%s, modification_date=%s WHERE id=%s"""
        params = (now, now, bibrec_id)
    else:
        query = """UPDATE bibrec SET modification_date=%s WHERE id=%s"""
        params = (now, bibrec_id)
    if not pretend:
        run_sql(query, params)
    write_message("   -Update record creation/modification date: DONE", verbose=2)

def update_bibfmt_format(id_bibrec, format_value, format_name, modification_date=None, pretend=False):
    """Update the format in the bibfmt table."""
    if modification_date is None:
        modification_date = time.strftime('%Y-%m-%d %H:%M:%S')
    else:
        try:
            time.strptime(modification_date, "%Y-%m-%d %H:%M:%S")
        except ValueError:
            modification_date = '1970-01-01 00:00:00'

    # We check if the format is already in bibfmt.
    nb_found = find_record_format(id_bibrec, format_name)
    if nb_found == 1:
        # We are going to update the format:
        # compress the format_value value.
        pickled_format_value = compress(format_value)
        # Update the format:
        query = """UPDATE LOW_PRIORITY bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s"""
        params = (modification_date, pickled_format_value, id_bibrec, format_name)
        if not pretend:
            row_id = run_sql(query, params)
        if not pretend and row_id is None:
            write_message("   ERROR: during update_bibfmt_format function", verbose=1, stream=sys.stderr)
            return 1
        else:
            write_message("   -Update the format %s in bibfmt: DONE" % format_name, verbose=2)
            return 0
    elif nb_found > 1:
        write_message("   Failed: Same format %s found several times in bibfmt for the same record." % format_name, verbose=1, stream=sys.stderr)
        return 1
    else:
        # Insert the format information in bibfmt.
        res = insert_bibfmt(id_bibrec, format_value, format_name, modification_date, pretend=pretend)
        if res is None:
            write_message("   ERROR: during insert_bibfmt", verbose=1, stream=sys.stderr)
            return 1
        else:
            write_message("   -Insert the format %s in bibfmt: DONE" % format_name, verbose=2)
            return 0

def delete_bibfmt_format(id_bibrec, format_name, pretend=False):
    """
    Delete format FORMAT_NAME from the bibfmt table for record ID_BIBREC.
    """
    if not pretend:
        run_sql("DELETE LOW_PRIORITY FROM bibfmt WHERE id_bibrec=%s AND format=%s", (id_bibrec, format_name))
    return 0
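## Illustrative usage sketch (not part of the original code): a typical
## refresh of the cached MARCXML of a hypothetical record 123, combining the
## helpers above; `marcxml` is assumed to hold the serialized record:
##
##   now = time.strftime('%Y-%m-%d %H:%M:%S')
##   update_bibrec_date(now, 123, insert_mode_p=False)
##   update_bibfmt_format(123, marcxml, 'xm', modification_date=now)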
""" res = run_sql("SELECT id_bibrec, value, last_updated FROM bibfmt WHERE format='xm' AND id_bibrec=%s", (recID,)) db_affected_fields = "" if affected_fields: tmp_affected_fields = {} for field in affected_fields: if field.isdigit(): #hack for tags from RevisionVerifier for ind in affected_fields[field]: tmp_affected_fields[(field + ind[0] + ind[1] + "%").replace(" ", "_")] = 1 else: pass #future implementation for fields tmp_affected_fields = tmp_affected_fields.keys() tmp_affected_fields.sort() db_affected_fields = ",".join(tmp_affected_fields) if res and not pretend: run_sql("""INSERT INTO hstRECORD (id_bibrec, marcxml, job_id, job_name, job_person, job_date, job_details, affected_fields) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)""", (res[0][0], res[0][1], task_get_task_param('task_id', 0), 'bibupload', task_get_task_param('user', 'UNKNOWN'), res[0][2], 'mode: ' + task_get_option('mode', 'UNKNOWN') + '; file: ' + task_get_option('file_path', 'UNKNOWN') + '.', db_affected_fields)) return 0 def update_database_with_metadata(record, rec_id, oai_rec_id="oai", affected_tags=None, pretend=False): """Update the database tables with the record and the record id given in parameter""" # extract only those tags that have been affected. # check happens at subfield level. This is to prevent overhead # associated with inserting already existing field with given ind pair write_message("update_database_with_metadata: record=%s, rec_id=%s, oai_rec_id=%s, affected_tags=%s" % (record, rec_id, oai_rec_id, affected_tags), verbose=9) tmp_record = {} if affected_tags: for tag in record.keys(): if tag in affected_tags.keys(): write_message(" -Tag %s found to be modified.Setting up for update" % tag, verbose=9) # initialize new list to hold affected field new_data_tuple_list = [] for data_tuple in record[tag]: ind1 = data_tuple[1] ind2 = data_tuple[2] if (ind1, ind2) in affected_tags[tag]: write_message(" -Indicator pair (%s, %s) added to update list" % (ind1, ind2), verbose=9) new_data_tuple_list.append(data_tuple) tmp_record[tag] = new_data_tuple_list write_message(lambda: " -Modified fields: \n%s" % record_xml_output(tmp_record), verbose=2) else: tmp_record = record for tag in tmp_record.keys(): # check if tag is not a special one: if tag not in CFG_BIBUPLOAD_SPECIAL_TAGS: # for each tag there is a list of tuples representing datafields tuple_list = tmp_record[tag] # this list should contain the elements of a full tag [tag, ind1, ind2, subfield_code] tag_list = [] tag_list.append(tag) for single_tuple in tuple_list: # these are the contents of a single tuple subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # append the ind's to the full tag if ind1 == '' or ind1 == ' ': tag_list.append('_') else: tag_list.append(ind1) if ind2 == '' or ind2 == ' ': tag_list.append('_') else: tag_list.append(ind2) datafield_number = single_tuple[4] if tag in CFG_BIBUPLOAD_SPECIAL_TAGS: # nothing to do for special tags (FFT, BDR, BDM) pass elif tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS and tag != "001": value = single_tuple[3] # get the full tag full_tag = ''.join(tag_list) # update the tables write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9) # insert the tag and value into into bibxxx (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value, pretend=pretend) #print 'tname, bibrow', table_name, bibxxx_row_id; if table_name is None or bibxxx_row_id is None: write_message(" Failed: during insert_record_bibxxx", verbose=1, stream=sys.stderr) # connect bibxxx and bibrec 
with the table bibrec_bibxxx res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id, pretend=pretend) if res is None: write_message(" Failed: during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr) else: # get the tag and value from the content of each subfield for subfield in subfield_list: subtag = subfield[0] value = subfield[1] tag_list.append(subtag) # get the full tag full_tag = ''.join(tag_list) # update the tables write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9) # insert the tag and value into into bibxxx (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value, pretend=pretend) if table_name is None or bibxxx_row_id is None: write_message(" Failed: during insert_record_bibxxx", verbose=1, stream=sys.stderr) # connect bibxxx and bibrec with the table bibrec_bibxxx res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id, pretend=pretend) if res is None: write_message(" Failed: during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr) # remove the subtag from the list tag_list.pop() tag_list.pop() tag_list.pop() tag_list.pop() write_message(" -Update the database with metadata: DONE", verbose=2) log_record_uploading(oai_rec_id, task_get_task_param('task_id', 0), rec_id, 'P', pretend=pretend) def append_new_tag_to_old_record(record, rec_old): """Append new tags to a old record""" def _append_tag(tag): if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS: if tag == '001': pass else: # if it is a controlfield, just access the value for single_tuple in record[tag]: controlfield_value = single_tuple[3] # add the field to the old record newfield_number = record_add_field(rec_old, tag, controlfield_value=controlfield_value) if newfield_number is None: write_message(" ERROR: when adding the field"+tag, verbose=1, stream=sys.stderr) else: # For each tag there is a list of tuples representing datafields for single_tuple in record[tag]: # We retrieve the information of the tag subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] if '%s%s%s' % (tag, ind1 == ' ' and '_' or ind1, ind2 == ' ' and '_' or ind2) in (CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:5], CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[:5]): ## We don't want to append the external identifier ## if it is already existing. if record_find_field(rec_old, tag, single_tuple)[0] is not None: write_message(" Not adding tag: %s ind1=%s ind2=%s subfields=%s: it's already there" % (tag, ind1, ind2, subfield_list), verbose=9) continue # We add the datafield to the old record write_message(" Adding tag: %s ind1=%s ind2=%s subfields=%s" % (tag, ind1, ind2, subfield_list), verbose=9) newfield_number = record_add_field(rec_old, tag, ind1, ind2, subfields=subfield_list) if newfield_number is None: write_message(" ERROR: when adding the field"+tag, verbose=1, stream=sys.stderr) # Go through each tag in the appended record for tag in record: _append_tag(tag) return rec_old def copy_strong_tags_from_old_record(record, rec_old): """ Look for strong tags in RECORD and REC_OLD. If no strong tags are found in RECORD, then copy them over from REC_OLD. This function modifies RECORD structure on the spot. 
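## Illustrative sketch (inferred from how fields are unpacked in this module,
## not part of the original code): the record structures handled here map a
## MARC tag to a list of field instances of the form
## (subfields, ind1, ind2, controlfield_value, field_position), e.g.:
##
##   record = {
##       '001': [([], ' ', ' ', '123', 1)],
##       '100': [([('a', 'Doe, John'), ('u', 'Atlantis Institute')], ' ', ' ', '', 2)],
##   }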
""" for strong_tag in CFG_BIBUPLOAD_STRONG_TAGS: if not record_get_field_instances(record, strong_tag, strong_tag[3:4] or '%', strong_tag[4:5] or '%'): strong_tag_old_field_instances = record_get_field_instances(rec_old, strong_tag) if strong_tag_old_field_instances: for strong_tag_old_field_instance in strong_tag_old_field_instances: sf_vals, fi_ind1, fi_ind2, controlfield, dummy = strong_tag_old_field_instance record_add_field(record, strong_tag, fi_ind1, fi_ind2, controlfield, sf_vals) return ### Delete functions def delete_tags(record, rec_old): """ Returns a record structure with all the fields in rec_old minus the fields in record. @param record: The record containing tags to delete. @type record: record structure @param rec_old: The original record. @type rec_old: record structure @return: The modified record. @rtype: record structure """ returned_record = copy.deepcopy(rec_old) for tag, fields in record.iteritems(): if tag in ('001', ): continue for field in fields: local_position = record_find_field(returned_record, tag, field)[1] if local_position is not None: record_delete_field(returned_record, tag, field_position_local=local_position) return returned_record def delete_tags_to_correct(record, rec_old): """ Delete tags from REC_OLD which are also existing in RECORD. When deleting, pay attention not only to tags, but also to indicators, so that fields with the same tags but different indicators are not deleted. """ ## Some fields are controlled via provenance information. ## We should re-add saved fields at the end. fields_to_readd = {} for tag in CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS: if tag[:3] in record: tmp_field_instances = record_get_field_instances(record, tag[:3], tag[3], tag[4]) ## Let's discover the provenance that will be updated provenances_to_update = [] for instance in tmp_field_instances: for code, value in instance[0]: if code == tag[5]: if value not in provenances_to_update: provenances_to_update.append(value) break else: ## The provenance is not specified. ## let's add the special empty provenance. if '' not in provenances_to_update: provenances_to_update.append('') potential_fields_to_readd = record_get_field_instances(rec_old, tag[:3], tag[3], tag[4]) ## Let's take all the field corresponding to tag ## Let's save apart all the fields that should be updated, but ## since they have a different provenance not mentioned in record ## they should be preserved. fields = [] for sf_vals, ind1, ind2, dummy_cf, dummy_line in potential_fields_to_readd: for code, value in sf_vals: if code == tag[5]: if value not in provenances_to_update: fields.append(sf_vals) break else: if '' not in provenances_to_update: ## Empty provenance, let's protect in any case fields.append(sf_vals) fields_to_readd[tag] = fields # browse through all the tags from the MARCXML file: for tag in record: # check if the tag exists in the old record too: if tag in rec_old and tag != '001': # the tag does exist, so delete all record's tag+ind1+ind2 combinations from rec_old for dummy_sf_vals, ind1, ind2, dummy_cf, dummyfield_number in record[tag]: write_message(" Delete tag: " + tag + " ind1=" + ind1 + " ind2=" + ind2, verbose=9) record_delete_field(rec_old, tag, ind1, ind2) ## Ok, we readd necessary fields! 
for tag, fields in fields_to_readd.iteritems(): for sf_vals in fields: write_message(" Adding tag: " + tag[:3] + " ind1=" + tag[3] + " ind2=" + tag[4] + " code=" + str(sf_vals), verbose=9) record_add_field(rec_old, tag[:3], tag[3], tag[4], subfields=sf_vals) def delete_bibrec_bibxxx(record, id_bibrec, affected_tags={}, pretend=False): """Delete the database record from the table bibxxx given in parameters""" # we clear all the rows from bibrec_bibxxx from the old record # clearing only those tags that have been modified. write_message(lambda: "delete_bibrec_bibxxx(record=%s, id_bibrec=%s, affected_tags=%s)" % (record, id_bibrec, affected_tags), verbose=9) for tag in affected_tags: # sanity check with record keys just to make sure its fine. if tag not in CFG_BIBUPLOAD_SPECIAL_TAGS: write_message("%s found in record"%tag, verbose=2) # for each name construct the bibrec_bibxxx table name table_name = 'bib'+tag[0:2]+'x' bibrec_table = 'bibrec_'+table_name # delete all the records with proper id_bibrec. Indicators matter for individual affected tags tmp_ind_1 = '' tmp_ind_2 = '' # construct exact tag value using indicators for ind_pair in affected_tags[tag]: if ind_pair[0] == ' ': tmp_ind_1 = '_' else: tmp_ind_1 = ind_pair[0] if ind_pair[1] == ' ': tmp_ind_2 = '_' else: tmp_ind_2 = ind_pair[1] # need to escape incase of underscore so that mysql treats it as a char tag_val = tag+"\\"+tmp_ind_1+"\\"+tmp_ind_2 + '%' query = """DELETE br.* FROM `%s` br,`%s` b where br.id_bibrec=%%s and br.id_bibxxx=b.id and b.tag like %%s""" % (bibrec_table, table_name) params = (id_bibrec, tag_val) write_message(query % params, verbose=9) if not pretend: run_sql(query, params) else: write_message("%s not found"%tag, verbose=2) def main(): """Main that construct all the bibtask.""" task_init(authorization_action='runbibupload', authorization_msg="BibUpload Task Submission", description="""Receive MARC XML file and update appropriate database tables according to options. Examples: $ bibupload -i input.xml """, help_specific_usage=""" -a, --append\t\tnew fields are appended to the existing record -c, --correct\t\tfields are replaced by the new ones in the existing record, except \t\t\twhen overridden by CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS -i, --insert\t\tinsert the new record in the database -r, --replace\t\tthe existing record is entirely replaced by the new one, \t\t\texcept for fields in CFG_BIBUPLOAD_STRONG_TAGS -d, --delete\t\tspecified fields are deleted in existing record -n, --notimechange\tdo not change record last modification date when updating -o, --holdingpen\tInsert record into holding pen instead of the normal database --pretend\t\tdo not really insert/append/correct/replace the input file --force\t\twhen --replace, use provided 001 tag values, even if the matching \t\t\trecord does not exist (thus allocating it on-the-fly) --callback-url\tSend via a POST request a JSON-serialized answer (see admin guide), in \t\t\torder to provide a feedback to an external service about the outcome of the operation. --nonce\t\twhen used together with --callback add the nonce value in the JSON message. --special-treatment=MODE\tif "oracle" is specified, when used together with --callback_url, \t\t\tPOST an application/x-www-form-urlencoded request where the JSON message is encoded \t\t\tinside a form field called "results". 
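## Illustrative sketch (not part of the original code): for a hypothetical
## affected tag '100' with indicator pair (' ', ' ') on record 123, the loop
## above builds tag_val = '100\_\_%' and issues roughly:
##
##   DELETE br.* FROM `bibrec_bib10x` br, `bib10x` b
##   WHERE br.id_bibrec=123 AND br.id_bibxxx=b.id AND b.tag LIKE '100\_\_%'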
""", version=__revision__, specific_params=("ircazdnoS:", [ "insert", "replace", "correct", "append", "reference", "delete", "notimechange", "holdingpen", "pretend", "force", "callback-url=", "nonce=", "special-treatment=", "stage=", ]), task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter, task_run_fnc=task_run_core, task_submit_check_options_fnc=task_submit_check_options) def task_submit_elaborate_specific_parameter(key, value, opts, args): # pylint: disable=W0613 """ Given the string key it checks it's meaning, eventually using the value. Usually it fills some key in the options dict. It must return True if it has elaborated the key, False, if it doesn't know that key. eg: if key in ['-n', '--number']: task_get_option(\1) = value return True return False """ # No time change option if key in ("-n", "--notimechange"): task_set_option('notimechange', 1) # Insert mode option elif key in ("-i", "--insert"): if task_get_option('mode') == 'replace': # if also replace found, then set to replace_or_insert task_set_option('mode', 'replace_or_insert') else: task_set_option('mode', 'insert') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Replace mode option elif key in ("-r", "--replace"): if task_get_option('mode') == 'insert': # if also insert found, then set to replace_or_insert task_set_option('mode', 'replace_or_insert') else: task_set_option('mode', 'replace') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Holding pen mode option elif key in ("-o", "--holdingpen"): write_message("Holding pen mode", verbose=3) task_set_option('mode', 'holdingpen') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Correct mode option elif key in ("-c", "--correct"): task_set_option('mode', 'correct') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Append mode option elif key in ("-a", "--append"): task_set_option('mode', 'append') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Deprecated reference mode option (now correct) elif key in ("-z", "--reference"): task_set_option('mode', 'correct') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("-d", "--delete"): task_set_option('mode', 'delete') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("--pretend",): task_set_option('pretend', True) fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("--force",): task_set_option('force', True) fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("--callback-url", ): task_set_option('callback_url', value) elif key in ("--nonce", ): task_set_option('nonce', value) elif key in ("--special-treatment", ): if value.lower() in CFG_BIBUPLOAD_ALLOWED_SPECIAL_TREATMENTS: if value.lower() == 'oracle': task_set_option('oracle_friendly', True) else: print >> sys.stderr, """The specified value is not in the list of allowed special treatments codes: %s""" % CFG_BIBUPLOAD_ALLOWED_SPECIAL_TREATMENTS return False elif key in ("-S", "--stage"): print >> sys.stderr, """WARNING: the --stage parameter is deprecated and ignored.""" else: return False return True def task_submit_check_options(): """ Reimplement this method for having the possibility to check options before submitting the task, in order for example to provide default values. 
It must return False if there are errors in the options. """ if task_get_option('mode') is None: write_message("Please specify at least one update/insert mode!", stream=sys.stderr) return False file_path = task_get_option('file_path') if file_path is None: write_message("Missing filename! -h for help.", stream=sys.stderr) return False try: open(file_path).read().decode('utf-8') except IOError: write_message("""File is not accessible: %s""" % file_path, stream=sys.stderr) return False except UnicodeDecodeError: write_message("""File encoding is not valid utf-8: %s""" % file_path, stream=sys.stderr) return False return True def writing_rights_p(): """Return True in case bibupload has the proper rights to write in the fulltext file folder.""" if _WRITING_RIGHTS is not None: return _WRITING_RIGHTS try: if not os.path.exists(CFG_BIBDOCFILE_FILEDIR): os.makedirs(CFG_BIBDOCFILE_FILEDIR) fd, filename = tempfile.mkstemp(suffix='.txt', prefix='test', dir=CFG_BIBDOCFILE_FILEDIR) test = os.fdopen(fd, 'w') test.write('TEST') test.close() if open(filename).read() != 'TEST': raise IOError("Can not successfully write and readback %s" % filename) os.remove(filename) except: register_exception(alert_admin=True) return False return True def post_results_to_callback_url(results, callback_url): write_message("Sending feedback to %s" % callback_url) if not CFG_JSON_AVAILABLE: from warnings import warn warn("--callback-url used but simplejson/json not available") return json_results = json.dumps(results) write_message("Message to send: %s" % json_results, verbose=9) ## :///?# scheme, dummynetloc, dummypath, dummyquery, dummyfragment = urlparse.urlsplit(callback_url) ## See: http://stackoverflow.com/questions/111945/is-there-any-way-to-do-http-put-in-python if scheme == 'http': opener = urllib2.build_opener(urllib2.HTTPHandler) elif scheme == 'https': opener = urllib2.build_opener(urllib2.HTTPSHandler) else: raise ValueError("Scheme not handled %s for callback_url %s" % (scheme, callback_url)) if task_get_option('oracle_friendly'): write_message("Oracle friendly mode requested", verbose=9) request = urllib2.Request(callback_url, data=urllib.urlencode({'results': json_results})) request.add_header('Content-Type', 'application/x-www-form-urlencoded') else: request = urllib2.Request(callback_url, data=json_results) request.add_header('Content-Type', 'application/json') request.add_header('User-Agent', make_user_agent_string('BibUpload')) write_message("Headers about to be sent: %s" % request.headers, verbose=9) write_message("Data about to be sent: %s" % request.data, verbose=9) res = opener.open(request) msg = res.read() write_message("Result of posting the feedback: %s %s" % (res.code, res.msg), verbose=9) write_message("Returned message is: %s" % msg, verbose=9) return res def bibupload_records(records, opt_mode=None, opt_notimechange=0, pretend=False, callback_url=None, results_for_callback=None): """perform the task of uploading a set of records returns list of (error_code, recid) tuples for separate records """ #Dictionaries maintaining temporary identifiers # Structure: identifier -> number tmp_ids = {} tmp_vers = {} results = [] # The first phase -> assigning meaning to temporary identifiers if opt_mode == 'reference': ## NOTE: reference mode has been deprecated in favour of 'correct' opt_mode = 'correct' record = None for record in records: record_id = record_extract_oai_id(record) task_sleep_now_if_required(can_stop_too=True) if opt_mode == "holdingpen": #inserting into the holding pen 
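## Illustrative sketch (not part of the original code): the JSON message
## POSTed by post_results_to_callback_url, assuming a hypothetical batch in
## which record 123 succeeded and a second record failed, with --nonce=42:
##
##   {"results": [{"recid": 123, "success": true,
##                 "marcxml": "<record>...</record>",
##                 "url": "http://localhost/record/123"},
##                {"recid": -1, "success": false,
##                 "error_message": "..."}],
##    "nonce": "42"}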
write_message("Inserting into holding pen", verbose=3) insert_record_into_holding_pen(record, record_id, pretend=pretend) else: write_message("Inserting into main database", verbose=3) error = bibupload( record, opt_mode = opt_mode, opt_notimechange = opt_notimechange, oai_rec_id = record_id, pretend = pretend, tmp_ids = tmp_ids, tmp_vers = tmp_vers) results.append(error) if error[0] == 1: if record: write_message(lambda: record_xml_output(record), stream=sys.stderr) else: write_message("Record could not have been parsed", stream=sys.stderr) stat['nb_errors'] += 1 if callback_url: results_for_callback['results'].append({'recid': error[1], 'success': False, 'error_message': error[2]}) elif error[0] == 2: if record: write_message(lambda: record_xml_output(record), stream=sys.stderr) else: write_message("Record could not have been parsed", stream=sys.stderr) if callback_url: results_for_callback['results'].append({'recid': error[1], 'success': False, 'error_message': error[2]}) elif error[0] == 0: if callback_url: from invenio.search_engine import print_record results_for_callback['results'].append({'recid': error[1], 'success': True, "marcxml": print_record(error[1], 'xm'), 'url': "%s/%s/%s" % (CFG_SITE_URL, CFG_SITE_RECORD, error[1])}) else: if callback_url: results_for_callback['results'].append({'recid': error[1], 'success': False, 'error_message': error[2]}) # stat us a global variable task_update_progress("Done %d out of %d." % \ (stat['nb_records_inserted'] + \ stat['nb_records_updated'], stat['nb_records_to_upload'])) # Second phase -> Now we can process all entries where temporary identifiers might appear (BDR, BDM) write_message("Identifiers table after processing: %s versions: %s" % (str(tmp_ids), str(tmp_vers)), verbose=2) write_message("Uploading BDR and BDM fields") if opt_mode != "holdingpen": for record in records: record_id = retrieve_rec_id(record, opt_mode, pretend=pretend, post_phase = True) bibupload_post_phase(record, rec_id = record_id, mode = opt_mode, pretend = pretend, tmp_ids = tmp_ids, tmp_vers = tmp_vers) return results def task_run_core(): """ Reimplement to add the body of the task.""" write_message("Input file '%s', input mode '%s'." 
% (task_get_option('file_path'), task_get_option('mode'))) write_message("STAGE 0:", verbose=2) if task_get_option('file_path') is not None: write_message("start preocessing", verbose=3) task_update_progress("Reading XML input") recs = xml_marc_to_records(open_marc_file(task_get_option('file_path'))) stat['nb_records_to_upload'] = len(recs) write_message(" -Open XML marc: DONE", verbose=2) task_sleep_now_if_required(can_stop_too=True) write_message("Entering records loop", verbose=3) callback_url = task_get_option('callback_url') results_for_callback = {'results': []} if recs is not None: # We proceed each record by record bibupload_records(records=recs, opt_mode=task_get_option('mode'), opt_notimechange=task_get_option('notimechange'), pretend=task_get_option('pretend'), callback_url=callback_url, results_for_callback=results_for_callback) else: write_message(" ERROR: bibupload failed: No record found", verbose=1, stream=sys.stderr) callback_url = task_get_option("callback_url") if callback_url: nonce = task_get_option("nonce") if nonce: results_for_callback["nonce"] = nonce post_results_to_callback_url(results_for_callback, callback_url) if task_get_task_param('verbose') >= 1: # Print out the statistics print_out_bibupload_statistics() # Check if they were errors return not stat['nb_errors'] >= 1 def log_record_uploading(oai_rec_id, task_id, bibrec_id, insertion_db, pretend=False): if oai_rec_id != "" and oai_rec_id != None: query = """UPDATE oaiHARVESTLOG SET date_inserted=NOW(), inserted_to_db=%s, id_bibrec=%s WHERE oai_id = %s AND bibupload_task_id = %s ORDER BY date_harvested LIMIT 1""" if not pretend: run_sql(query, (str(insertion_db), str(bibrec_id), str(oai_rec_id), str(task_id), )) if __name__ == "__main__": main()
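## Illustrative usage sketch (assuming a configured Invenio installation and
## a hypothetical MARCXML file /tmp/records.xml):
##
##   $ bibupload -i /tmp/records.xml
##   $ bibupload -c --callback-url=https://example.org/bibupload-hook /tmp/records.xml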