diff --git a/config/invenio.conf b/config/invenio.conf index ec5f1c99a..d7a235644 100644 --- a/config/invenio.conf +++ b/config/invenio.conf @@ -1,2249 +1,2257 @@ ## This file is part of Invenio. ## Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ################################################### ## About 'invenio.conf' and 'invenio-local.conf' ## ################################################### ## The 'invenio.conf' file contains the vanilla default configuration ## parameters of a Invenio installation, as coming out of the ## distribution. The file should be self-explanatory. Once installed ## in its usual location (usually /opt/invenio/etc), you could in ## principle go ahead and change the values according to your local ## needs, but this is not advised. ## ## If you would like to customize some of these parameters, you should ## rather create a file named 'invenio-local.conf' in the same ## directory where 'invenio.conf' lives and you should write there ## only the customizations that you want to be different from the ## vanilla defaults. 
## ## Here is a realistic, minimalist, yet production-ready example of ## what you would typically put there: ## ## $ cat /opt/invenio/etc/invenio-local.conf ## [Invenio] ## CFG_SITE_NAME = John Doe's Document Server ## CFG_SITE_NAME_INTL_fr = Serveur des Documents de John Doe ## CFG_SITE_URL = http://your.site.com ## CFG_SITE_SECURE_URL = https://your.site.com ## CFG_SITE_ADMIN_EMAIL = john.doe@your.site.com ## CFG_SITE_SUPPORT_EMAIL = john.doe@your.site.com ## CFG_WEBALERT_ALERT_ENGINE_EMAIL = john.doe@your.site.com ## CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL = john.doe@your.site.com ## CFG_WEBCOMMENT_DEFAULT_MODERATOR = john.doe@your.site.com ## CFG_DATABASE_HOST = localhost ## CFG_DATABASE_NAME = invenio ## CFG_DATABASE_USER = invenio ## CFG_DATABASE_PASS = my123p$ss ## ## You should override at least the parameters mentioned above and the ## parameters mentioned in the `Part 1: Essential parameters' below in ## order to define some very essential runtime parameters such as the ## name of your document server (CFG_SITE_NAME and ## CFG_SITE_NAME_INTL_*), the visible URL of your document server ## (CFG_SITE_URL and CFG_SITE_SECURE_URL), the email address of the ## local Invenio administrator, comment moderator, and alert engine ## (CFG_SITE_SUPPORT_EMAIL, CFG_SITE_ADMIN_EMAIL, etc), and last but ## not least your database credentials (CFG_DATABASE_*). ## ## The Invenio system will then read both the default invenio.conf ## file and your customized invenio-local.conf file and it will ## override any default options with the ones you have specified in ## your local file. This cascading of configuration parameters will ## ease your future upgrades. [Invenio] ################################### ## Part 1: Essential parameters ## ################################### ## This part defines essential Invenio internal parameters that ## everybody should override, like the name of the server or the email ## address of the local Invenio administrator. 
## CFG_DATABASE_* - specify which MySQL server to use, the name of the ## database to use, and the database access credentials. CFG_DATABASE_HOST = localhost CFG_DATABASE_PORT = 3306 CFG_DATABASE_NAME = invenio CFG_DATABASE_USER = invenio CFG_DATABASE_PASS = my123p$ss ## CFG_DATABASE_SLAVE - if you use DB replication, then specify the DB ## slave address credentials. (Assuming the same access rights to the ## DB slave as to the DB master.) If you don't use DB replication, ## then leave this option blank. CFG_DATABASE_SLAVE = ## CFG_SITE_URL - specify URL under which your installation will be ## visible. For example, use "http://your.site.com". Do not leave ## trailing slash. CFG_SITE_URL = http://localhost ## CFG_SITE_SECURE_URL - specify secure URL under which your ## installation secure pages such as login or registration will be ## visible. For example, use "https://your.site.com". Do not leave ## trailing slash. If you don't plan on using HTTPS, then you may ## leave this empty. CFG_SITE_SECURE_URL = https://localhost ## CFG_SITE_NAME -- the visible name of your Invenio installation. CFG_SITE_NAME = Atlantis Institute of Fictive Science ## CFG_SITE_NAME_INTL -- the international versions of CFG_SITE_NAME ## in various languages. (See also CFG_SITE_LANGS below.) 
CFG_SITE_NAME_INTL_en = Atlantis Institute of Fictive Science CFG_SITE_NAME_INTL_fr = Atlantis Institut des Sciences Fictives CFG_SITE_NAME_INTL_de = Atlantis Institut der fiktiven Wissenschaft CFG_SITE_NAME_INTL_es = Atlantis Instituto de la Ciencia Fictive CFG_SITE_NAME_INTL_ca = Institut Atlantis de Ciència Fictícia CFG_SITE_NAME_INTL_pt = Instituto Atlantis de Ciência Fictícia CFG_SITE_NAME_INTL_it = Atlantis Istituto di Scienza Fittizia CFG_SITE_NAME_INTL_ru = Институт Фиктивных Наук Атлантиды CFG_SITE_NAME_INTL_sk = Atlantis Inštitút Fiktívnych Vied CFG_SITE_NAME_INTL_cs = Atlantis Institut Fiktivních Věd CFG_SITE_NAME_INTL_no = Atlantis Institutt for Fiktiv Vitenskap CFG_SITE_NAME_INTL_sv = Atlantis Institut för Fiktiv Vetenskap CFG_SITE_NAME_INTL_el = Ινστιτούτο Φανταστικών Επιστημών Ατλαντίδος CFG_SITE_NAME_INTL_uk = Інститут вигаданих наук в Атлантісі CFG_SITE_NAME_INTL_ja = Fictive 科学のAtlantis の協会 CFG_SITE_NAME_INTL_pl = Instytut Fikcyjnej Nauki Atlantis CFG_SITE_NAME_INTL_bg = Институт за фиктивни науки Атлантис CFG_SITE_NAME_INTL_hr = Institut Fiktivnih Znanosti Atlantis CFG_SITE_NAME_INTL_zh_CN = 阿特兰提斯虚拟科学学院 CFG_SITE_NAME_INTL_zh_TW = 阿特蘭提斯虛擬科學學院 CFG_SITE_NAME_INTL_hu = Kitalált Tudományok Atlantiszi Intézete CFG_SITE_NAME_INTL_af = Atlantis Instituut van Fiktiewe Wetenskap CFG_SITE_NAME_INTL_gl = Instituto Atlantis de Ciencia Fictive CFG_SITE_NAME_INTL_ro = Institutul Atlantis al Ştiinţelor Fictive CFG_SITE_NAME_INTL_rw = Atlantis Ishuri Rikuru Ry'ubuhanga CFG_SITE_NAME_INTL_ka = ატლანტიდის ფიქტიური მეცნიერების ინსტიტუტი CFG_SITE_NAME_INTL_lt = Fiktyvių Mokslų Institutas Atlantis CFG_SITE_NAME_INTL_ar = معهد أطلنطيس للعلوم الافتراضية ## CFG_SITE_LANG -- the default language of the interface: CFG_SITE_LANG = en ## CFG_SITE_LANGS -- list of all languages the user interface should ## be available in, separated by commas. The order specified below ## will be respected on the interface pages. A good default would be ## to use the alphabetical order.
Currently supported languages ## include Afrikaans, Arabic, Bulgarian, Catalan, Czech, German, Georgian, ## Greek, English, Spanish, French, Croatian, Hungarian, Galician, ## Italian, Japanese, Kinyarwanda, Lithuanian, Norwegian, Polish, ## Portuguese, Romanian, Russian, Slovak, Swedish, Ukrainian, Chinese ## (China), Chinese (Taiwan), so that the eventual maximum you can ## currently select is ## "af,ar,bg,ca,cs,de,el,en,es,fr,hr,gl,ka,it,rw,lt,hu,ja,no,pl,pt,ro,ru,sk,sv,uk,zh_CN,zh_TW". CFG_SITE_LANGS = af,ar,bg,ca,cs,de,el,en,es,fr,hr,gl,ka,it,rw,lt,hu,ja,no,pl,pt,ro,ru,sk,sv,uk,zh_CN,zh_TW ## CFG_SITE_SUPPORT_EMAIL -- the email address of the support team for ## this installation: CFG_SITE_SUPPORT_EMAIL = info@invenio-software.org ## CFG_SITE_ADMIN_EMAIL -- the email address of the 'superuser' for ## this installation. Enter your email address below and login with ## this address when using Invenio administration modules. You ## will then be automatically recognized as superuser of the system. CFG_SITE_ADMIN_EMAIL = info@invenio-software.org ## CFG_SITE_EMERGENCY_EMAIL_ADDRESSES -- list of email addresses to ## which an email should be sent in case of emergency (e.g. bibsched ## queue has been stopped because of an error). Configuration ## dictionary allows for different recipients based on weekday and ## time-of-day. Example: ## ## CFG_SITE_EMERGENCY_EMAIL_ADDRESSES = { ## 'Sunday 22:00-06:00': '0041761111111@email2sms.foo.com', ## '06:00-18:00': 'team-in-europe@foo.com,0041762222222@email2sms.foo.com', ## '18:00-06:00': 'team-in-usa@foo.com', ## '*': 'john.doe.phone@foo.com'} ## ## If you want the emergency email notifications to always go to the ## same address, just use the wildcard line in the above example. CFG_SITE_EMERGENCY_EMAIL_ADDRESSES = {} ## CFG_SITE_ADMIN_EMAIL_EXCEPTIONS -- set this to 0 if you do not want ## to receive any captured exception via email to CFG_SITE_ADMIN_EMAIL ## address.
Captured exceptions will still be available in ## var/log/invenio.err file. Set this to 1 if you want to receive ## some of the captured exceptions (this depends on the actual place ## where the exception is captured). Set this to 2 if you want to ## receive all captured exceptions. CFG_SITE_ADMIN_EMAIL_EXCEPTIONS = 1 ## CFG_SITE_RECORD -- what is the URI part representing detailed ## record pages? We recomment to leave the default value `record' ## unchanged. CFG_SITE_RECORD = record ## CFG_ERRORLIB_RESET_EXCEPTION_NOTIFICATION_COUNTER_AFTER -- set this to ## the number of seconds after which to reset the exception notification ## counter. A given repetitive exception is notified via email with a ## logarithmic strategy: the first time it is seen it is sent via email, ## then the second time, then the fourth, then the eighth and so forth. ## If the number of seconds elapsed since the last time it was notified ## is greater than CFG_ERRORLIB_RESET_EXCEPTION_NOTIFICATION_COUNTER_AFTER ## then the internal counter is reset in order not to have exception ## notification become more and more rare. CFG_ERRORLIB_RESET_EXCEPTION_NOTIFICATION_COUNTER_AFTER = 14400 ## CFG_CERN_SITE -- do we want to enable CERN-specific code? ## Put "1" for "yes" and "0" for "no". CFG_CERN_SITE = 0 ## CFG_INSPIRE_SITE -- do we want to enable INSPIRE-specific code? ## Put "1" for "yes" and "0" for "no". CFG_INSPIRE_SITE = 0 ## CFG_ADS_SITE -- do we want to enable ADS-specific code? ## Put "1" for "yes" and "0" for "no". CFG_ADS_SITE = 0 ## CFG_OPENAIRE_SITE -- do we want to enable OpenAIRE-specific code? ## Put "1" for "yes" and "0" for "no". CFG_OPENAIRE_SITE = 0 ## Now you can tune whether to integrate with external authentication providers ## through the OpenID and OAuth protocols. ## The following variables let you fine-tune which authentication ## providers you want to authorize. You can override here most of ## the variables in lib/invenio/access_control_config.py. 
## In particular you can put in these variable the consumer_key and ## consumer_secret of the desired services. ## Note: some providers don't supply an mail address. ## If you choose them, the users will be registered with a temporary email address. ## CFG_OPENID_PROVIDERS -- Comma-separated list of providers you want to enable ## through the OpenID protocol. ## E.g.: CFG_OPENID_PROVIDERS = google,yahoo,aol,wordpress,myvidoop,openid,verisign,myopenid,myspace,livejournal,blogger CFG_OPENID_PROVIDERS = ## CFG_OAUTH1_PROVIDERS -- Comma-separated list of providers you want to enable ## through the OAuth1 protocol. ## Note: OAuth1 is in general deprecated in favour of OAuth2. ## E.g.: CFG_OAUTH1_PROVIDERS = twitter,linkedin,flickr, CFG_OAUTH1_PROVIDERS = ## CFG_OAUTH2_PROVIDERS -- Comma-separated list of providers you want to enable ## through the OAuth2 protocol. ## Note: if you enable the "orcid" provider the full profile of the user ## in Orcid will be imported. ## E.g.: CFG_OAUTH2_PROVIDERS = facebook,yammer,foursquare,googleoauth2,instagram,orcid CFG_OAUTH2_PROVIDERS = ## CFG_OPENID_CONFIGURATIONS -- Mapping of special parameter to configure the ## desired OpenID providers. Use this variable to override out-of-the-box ## parameters already set in lib/python/invenio/access_control_config.py. ## E.g.: CFG_OPENID_CONFIGURATIONS = {'google': { ## 'identifier': 'https://www.google.com/accounts/o8/id', ## 'trust_email': True}} CFG_OPENID_CONFIGURATIONS = {} ## CFG_OAUTH1_CONFIGURATIONS -- Mapping of special parameter to configure the ## desired OAuth1 providers. Use this variable to override out-of-the-box ## parameters already set in lib/python/invenio/access_control_config.py. ## E.g.: CFG_OAUTH1_CONFIGURATIONS = {'linkedin': { ## 'consumer_key' : 'MY_LINKEDIN_CONSUMER_KEY', ## 'consumer_secret' : 'MY_LINKEDIN_CONSUMER_SECRET'}} CFG_OAUTH1_CONFIGURATIONS = {} ## CFG_OAUTH2_CONFIGURATIONS -- Mapping of special parameter to configure the ## desired OAuth2 providers. 
Use this variable to override out-of-the-box ## parameters already set in lib/python/invenio/access_control_config.py. ## E.g.: CFG_OAUTH2_CONFIGURATIONS = {'orcid': { ## 'consumer_key' : 'MY_ORCID_CONSUMER_KEY', ## 'consumer_secret' : 'MY_ORCID_CONSUMER_SECRET'}} CFG_OAUTH2_CONFIGURATIONS = {} ## CFG_DEVEL_SITE -- is this a development site? If it is, you might ## prefer that it does not do certain things. For example, you might ## not want WebSubmit to send certain emails or trigger certain ## processes on a development site. ## Put "1" for "yes" (this is a development site) or "0" for "no" ## (this isn't a development site.) CFG_DEVEL_SITE = 0 ################################ ## Part 2: Web page style ## ################################ ## The variables affecting the page style. The most important one is ## the 'template skin' you would like to use and the obfuscation mode ## for your email addresses. Please refer to the WebStyle Admin Guide ## for more explanation. The other variables are listed here mostly ## for backwards compatibility purposes only. ## CFG_WEBSTYLE_TEMPLATE_SKIN -- what template skin do you want to ## use? CFG_WEBSTYLE_TEMPLATE_SKIN = default ## CFG_WEBSTYLE_EMAIL_ADDRESSES_OBFUSCATION_MODE. How do we "protect" ## email addresses from undesired automated email harvesters? This ## setting will not affect 'support' and 'admin' emails. ## NOTE: there is no ultimate solution to protect against email ## harvesting. All have drawbacks and can more or less be ## circumvented. Choose you preferred mode ([t] means "transparent" ## for the user): ## -1: hide all emails. ## [t] 0 : no protection, email returned as is. ## foo@example.com => foo@example.com ## 1 : basic email munging: replaces @ by [at] and . by [dot] ## foo@example.com => foo [at] example [dot] com ## [t] 2 : transparent name mangling: characters are replaced by ## equivalent HTML entities. ## foo@example.com => foo@example.com ## [t] 3 : javascript insertion. 
Requires Javascript enabled on client ## side. ## 4 : replaces @ and . characters by gif equivalents. ## foo@example.com => foo [at] example [dot] com CFG_WEBSTYLE_EMAIL_ADDRESSES_OBFUSCATION_MODE = 2 ## CFG_WEBSTYLE_INSPECT_TEMPLATES -- Do we want to debug all template ## functions so that they would return HTML results wrapped in ## comments indicating which part of HTML page was created by which ## template function? Useful only for debugging Pythonic HTML ## template. See WebStyle Admin Guide for more information. CFG_WEBSTYLE_INSPECT_TEMPLATES = 0 ## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXLEFTTOP -- eventual global HTML ## left top box: CFG_WEBSTYLE_CDSPAGEBOXLEFTTOP = ## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXLEFTBOTTOM -- eventual global ## HTML left bottom box: CFG_WEBSTYLE_CDSPAGEBOXLEFTBOTTOM = ## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXRIGHTTOP -- eventual global ## HTML right top box: CFG_WEBSTYLE_CDSPAGEBOXRIGHTTOP = ## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXRIGHTBOTTOM -- eventual global ## HTML right bottom box: CFG_WEBSTYLE_CDSPAGEBOXRIGHTBOTTOM = ## CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST -- when certain HTTP status ## codes are raised to the WSGI handler, the corresponding exceptions ## and error messages can be sent to the system administrator for ## inspecting. This is useful to detect and correct errors. The ## variable represents a comma-separated list of HTTP statuses that ## should alert admin. Wildcards are possible. If the status is ## followed by an "r", it means that a referer is required to exist ## (useful to distinguish broken known links from URL typos when 404 ## errors are raised). CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST = 404r,400,5*,41* ## CFG_WEBSTYLE_HTTP_USE_COMPRESSION -- whether to enable deflate ## compression of your HTTP/HTTPS connections. This will affect the Apache ## configuration snippets created by inveniocfg --create-apache-conf and ## the OAI-PMH Identify response. 
CFG_WEBSTYLE_HTTP_USE_COMPRESSION = 0 ## CFG_WEBSTYLE_REVERSE_PROXY_IPS -- if you are setting a multinode ## environment where an HTTP proxy such as mod_proxy is sitting in ## front of the Invenio web application and is forwarding requests to ## worker nodes, set here the list of IP addresses of the allowed ## HTTP proxies. This is needed in order to avoid IP address spoofing ## when worker nodes are also available on the public Internet and ## might receive forged HTTP requests. Only HTTP requests coming from ## the specified IP addresses will be considered as forwarded from a ## reverse proxy. E.g. set this to '123.123.123.123'. CFG_WEBSTYLE_REVERSE_PROXY_IPS = ################################## ## Part 3: WebSearch parameters ## ################################## ## This section contains some configuration parameters for WebSearch ## module. Please note that WebSearch is mostly configured on ## run-time via its WebSearch Admin web interface. The parameters ## below are the ones that you do not probably want to modify very ## often during the runtime. (Note that you may modify them ## afterwards too, though.) ## CFG_WEBSEARCH_SEARCH_CACHE_SIZE -- how many queries we want to ## cache in memory per one Apache httpd process? This cache is used ## mainly for "next/previous page" functionality, but it caches also ## "popular" user queries if more than one user happen to search for ## the same thing. Note that large numbers may lead to great memory ## consumption. We recommend a value not greater than 100. CFG_WEBSEARCH_SEARCH_CACHE_SIZE = 0 ## CFG_WEBSEARCH_FIELDS_CONVERT -- if you migrate from an older ## system, you may want to map field codes of your old system (such as ## 'ti') to Invenio/MySQL ("title"). Use Python dictionary syntax ## for the translation table, e.g. {'wau':'author', 'wti':'title'}. ## Usually you don't want to do that, and you would use empty dict {}.
CFG_WEBSEARCH_FIELDS_CONVERT = {} ## CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH -- width of the ## search pattern window in the light search interface, in ## characters. CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH = 60 ## CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH -- width of the search ## pattern window in the simple search interface, in characters. CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH = 40 ## CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH -- width of the ## search pattern window in the advanced search interface, in ## characters. CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH = 30 ## CFG_WEBSEARCH_NB_RECORDS_TO_SORT -- how many records do we still ## want to sort? For higher numbers we print only a warning and won't ## perform any sorting other than default 'latest records first', as ## sorting would be very time consuming then. We recommend a value of ## not more than a couple of thousands. CFG_WEBSEARCH_NB_RECORDS_TO_SORT = 1000 ## CFG_WEBSEARCH_CALL_BIBFORMAT -- if a record is being displayed but ## it was not preformatted in the "HTML brief" format, do we want to ## call BibFormatting on the fly? Put "1" for "yes" and "0" for "no". ## Note that "1" will display the record exactly as if it were fully ## preformatted, but it may be slow due to on-the-fly processing; "0" ## will display a default format very fast, but it may not have all ## the fields as in the fully preformatted HTML brief format. Note ## also that this option is active only for old (PHP) formats; the new ## (Python) formats are called on the fly by default anyway, since ## they are much faster. When unsure, please set "0" here. CFG_WEBSEARCH_CALL_BIBFORMAT = 0 ## CFG_WEBSEARCH_USE_ALEPH_SYSNOS -- do we want to make old SYSNOs ## visible rather than MySQL's record IDs? You may use this if you ## migrate from a different e-doc system, and you store your old ## system numbers into 970__a. Put "1" for "yes" and "0" for ## "no".
Usually you don't want to do that, though. CFG_WEBSEARCH_USE_ALEPH_SYSNOS = 0 ## CFG_WEBSEARCH_I18N_LATEST_ADDITIONS -- Put "1" if you want the ## "Latest Additions" in the web collection pages to show ## internationalized records. Useful only if your brief BibFormat ## templates contains internationalized strings. Otherwise put "0" in ## order not to slow down the creation of latest additions by WebColl. CFG_WEBSEARCH_I18N_LATEST_ADDITIONS = 0 ## CFG_WEBSEARCH_INSTANT_BROWSE -- the number of records to display ## under 'Latest Additions' in the web collection pages. CFG_WEBSEARCH_INSTANT_BROWSE = 10 ## CFG_WEBSEARCH_INSTANT_BROWSE_RSS -- the number of records to ## display in the RSS feed. CFG_WEBSEARCH_INSTANT_BROWSE_RSS = 25 ## CFG_WEBSEARCH_RSS_I18N_COLLECTIONS -- comma-separated list of ## collections that feature an internationalized RSS feed on their ## main search interface page created by webcoll. Other collections ## will have RSS feed using CFG_SITE_LANG. CFG_WEBSEARCH_RSS_I18N_COLLECTIONS = ## CFG_WEBSEARCH_RSS_TTL -- number of minutes that indicates how long ## a feed cache is valid. CFG_WEBSEARCH_RSS_TTL = 360 ## CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS -- maximum number of request kept ## in cache. If the cache is filled, following request are not cached. CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS = 1000 ## CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD -- up to how many author names ## to print explicitly; for more print "et al". Note that this is ## used in default formatting that is seldom used, as usually ## BibFormat defines all the format. The value below is only used ## when BibFormat fails, for example. CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD = 3 ## CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS -- whether to show or ## not collection grandsons in Narrow Search boxes (sons are shown by ## default, grandsons are configurable here). Use 0 for no and 1 for ## yes.
CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS = 1 ## CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX -- shall we ## create help links for Ellis, Nick or Ellis, Nicholas and friends ## when Ellis, N was searched for? Useful if you have one author ## stored in the database under several name formats, namely surname ## comma firstname and surname comma initial cataloging policy. Use 0 ## for no and 1 for yes. CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX = 1 ## CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS -- MathJax is a JavaScript ## library that renders (La)TeX mathematical formulas in the client ## browser. This parameter must contain a comma-separated list of ## output formats for which to apply the MathJax rendering, for example ## "hb,hd". If the list is empty, MathJax is disabled. ## See also CFG_WEBSUBMIT_USE_MATHJAX CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS = ## CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_TIMEOUT -- when searching ## external collections (e.g. SPIRES, CiteSeer, etc), how many seconds ## do we wait for reply before abandonning? CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_TIMEOUT = 5 ## CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_MAXRESULTS -- how many ## results do we fetch? CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_MAXRESULTS = 10 ## CFG_WEBSEARCH_SPLIT_BY_COLLECTION -- do we want to split the search ## results by collection or not? Use 0 for not, 1 for yes. CFG_WEBSEARCH_SPLIT_BY_COLLECTION = 1 ## CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS -- the default number of ## records to display per page in the search results pages. CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS = 10 ## CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS -- in order to limit denial of ## service attacks the total number of records per group displayed as a ## result of a search query will be limited to this number. Only the superuser ## queries will not be affected by this limit. 
CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS = 200 ## CFG_WEBSEARCH_SHOW_COMMENT_COUNT -- do we want to show the 'N comments' ## links on the search engine pages? (useful only when you have allowed ## commenting) CFG_WEBSEARCH_SHOW_COMMENT_COUNT = 1 ## CFG_WEBSEARCH_SHOW_REVIEW_COUNT -- do we want to show the 'N reviews' ## links on the search engine pages? (useful only when you have allowed ## reviewing) CFG_WEBSEARCH_SHOW_REVIEW_COUNT = 1 ## CFG_WEBSEARCH_FULLTEXT_SNIPPETS -- how many full-text snippets do ## we want to display for full-text searches? If you want to specify ## different values for different document status types, please add ## more items into this dictionary. (Unless specified, the empty ## value will be used as default.) This is useful if you have ## restricted files of different types with various restrictions on ## what we can show. CFG_WEBSEARCH_FULLTEXT_SNIPPETS = { '': 4, } ## CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS -- what is the maximum size ## of a snippet to display around the pattern found in the full-text? ## If you want to specify different values for different document ## status types, please add more items into this dictionary. (Unless ## specified, the empty value will be used as default.) This is ## useful if you have restricted files of different types with various ## restrictions on what we can show. CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS = { '': 100, } ## CFG_WEBSEARCH_WILDCARD_LIMIT -- some of the queries, wildcard ## queries in particular (ex: cern*, a*), but also regular expressions ## (ex: [a-z]+), may take a long time to respond due to the high ## number of hits. You can limit the number of terms matched by a ## wildcard by setting this variable. A negative value or zero means ## that none of the queries will be limited (which may be wanted but is ## also prone to denial-of-service kind of attacks).
CFG_WEBSEARCH_WILDCARD_LIMIT = 50000 ## CFG_WEBSEARCH_SYNONYM_KBRS -- defines which knowledge bases are to ## be used for which index in order to provide runtime synonym lookup ## of user-supplied terms, and what massaging function should be used ## upon search pattern before performing the KB lookup. (Can be one ## of `exact', 'leading_to_comma', `leading_to_number'.) CFG_WEBSEARCH_SYNONYM_KBRS = { 'journal': ['SEARCH-SYNONYM-JOURNAL', 'leading_to_number'], } ## CFG_SOLR_URL -- optionally, you may use Solr to serve full-text ## queries and ranking. If so, please specify the URL of your Solr instance. ## Example: http://localhost:8983/solr (default solr port) CFG_SOLR_URL = ## CFG_XAPIAN_ENABLED -- optionally, you may use Xapian to serve full-text ## queries and ranking. If so, please enable it: 1 = enabled CFG_XAPIAN_ENABLED = ## CFG_WEBSEARCH_PREV_NEXT_HIT_LIMIT -- specify the limit when ## the previous/next/back hit links are to be displayed on detailed record pages. ## In order to speed up list manipulations, if a search returns lots of hits, ## more than this limit, then do not lose time calculating next/previous/back ## hits at all, but display page directly without these. ## Note also that Invenio installations that do not like ## to have the next/previous hit link functionality would be able to set this ## variable to zero and not see anything. CFG_WEBSEARCH_PREV_NEXT_HIT_LIMIT = 1000 ## CFG_WEBSEARCH_PREV_NEXT_HIT_FOR_GUESTS -- Set this to 0 if you want ## to disable the previous/next/back hit link functionality for guest ## users. ## Since the previous/next/back hit link functionality is causing the allocation ## of user session in the database even for guest users, it might be useful to ## be able to disable it e.g. when your site is bombarded by web request ## (a.k.a. Slashdot effect).
CFG_WEBSEARCH_PREV_NEXT_HIT_FOR_GUESTS = 1 ## CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY -- when a record belongs to more than one ## restricted collection, if the viewrestcoll policy is set to "ALL" (default) ## then the user must be authorized to all the restricted collections, in ## order to be granted access to the specific record. If the policy is set to ## "ANY", then the user needs to be authorized to only one of the collections ## in order to be granted access to the specific record. CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY = ANY ## CFG_WEBSEARCH_SPIRES_SYNTAX -- variable to configure the use of the ## SPIRES query syntax in searches. Values: 0 = SPIRES syntax is ## switched off; 1 = leading 'find' is required; 9 = leading 'find' is ## not required (leading SPIRES operator, space-operator-space, etc ## are also accepted). CFG_WEBSEARCH_SPIRES_SYNTAX = 1 ## CFG_WEBSEARCH_DISPLAY_NEAREST_TERMS -- when user search does not ## return any direct result, what do we want to display? Set to 0 in ## order to display a generic message about search returning no hits. ## Set to 1 in order to display list of nearest terms from the indexes ## that may match user query. Note: this functionality may be slow, ## so you may want to disable it on bigger sites. CFG_WEBSEARCH_DISPLAY_NEAREST_TERMS = 1 ## CFG_WEBSEARCH_DETAILED_META_FORMAT -- the output format to use for ## detailed meta tags containing metadata as configured in the tag ## table. Default output format should be 'hdm', included. This ## format will be included in the header of /record/ pages. For ## efficiency this format should be pre-cached with BibReformat. See ## also CFG_WEBSEARCH_ENABLE_GOOGLESCHOLAR and ## CFG_WEBSEARCH_ENABLE_OPENGRAPH. CFG_WEBSEARCH_DETAILED_META_FORMAT = hdm ## CFG_WEBSEARCH_ENABLE_GOOGLESCHOLAR -- decides if meta tags for ## Google Scholar shall be included in the detailed record page ## header, when using the standard formatting templates/elements.
See ## also CFG_WEBSEARCH_DETAILED_META_FORMAT and ## CFG_WEBSEARCH_ENABLE_OPENGRAPH. When this variable is changed and ## output format defined in CFG_WEBSEARCH_DETAILED_META_FORMAT is ## cached, a bibreformat must be run for the cached records. CFG_WEBSEARCH_ENABLE_GOOGLESCHOLAR = True ## CFG_WEBSEARCH_ENABLE_OPENGRAPH -- decides if meta tags for the Open ## Graph protocol shall be included in the detailed record page ## header, when using the standard formatting templates/elements. See ## also CFG_WEBSEARCH_DETAILED_META_FORMAT and ## CFG_WEBSEARCH_ENABLE_GOOGLESCHOLAR. When this variable is changed ## and output format defined in CFG_WEBSEARCH_DETAILED_META_FORMAT is ## cached, a bibreformat must be run for the cached records. Note that ## enabling Open Graph produces invalid XHTML/HTML5 markup. CFG_WEBSEARCH_ENABLE_OPENGRAPH = False ## CFG_WEBSEARCH_CITESUMMARY_SELFCITES_THRESHOLD -- switches off ## self-citations computation if the number records in the citesummary ## is above the threshold CFG_WEBSEARCH_CITESUMMARY_SELFCITES_THRESHOLD = 2000 ####################################### ## Part 4: BibHarvest OAI parameters ## ####################################### ## This part defines parameters for the Invenio OAI gateway. ## Useful if you are running Invenio as OAI data provider. 
## CFG_OAI_ID_FIELD -- OAI identifier MARC field: CFG_OAI_ID_FIELD = 909COo ## CFG_OAI_SET_FIELD -- OAI set MARC field: CFG_OAI_SET_FIELD = 909COp ## CFG_OAI_PREVIOUS_SET_FIELD -- previous OAI set MARC field: CFG_OAI_PREVIOUS_SET_FIELD = 909COq ## CFG_OAI_DELETED_POLICY -- OAI deletedrecordspolicy ## (no/transient/persistent): CFG_OAI_DELETED_POLICY = persistent ## CFG_OAI_ID_PREFIX -- OAI identifier prefix: CFG_OAI_ID_PREFIX = atlantis.cern.ch ## CFG_OAI_SAMPLE_IDENTIFIER -- OAI sample identifier: CFG_OAI_SAMPLE_IDENTIFIER = oai:atlantis.cern.ch:123 ## CFG_OAI_IDENTIFY_DESCRIPTION -- description for the OAI Identify verb: CFG_OAI_IDENTIFY_DESCRIPTION = %(CFG_SITE_URL)s Free and unlimited use by anybody with obligation to refer to original record Full content, i.e. preprints may not be harvested by robots Submission restricted. Submitted documents are subject of approval by OAI repository admins. ## CFG_OAI_LOAD -- OAI number of records in a response: CFG_OAI_LOAD = 500 ## CFG_OAI_EXPIRE -- OAI resumptionToken expiration time: CFG_OAI_EXPIRE = 90000 ## CFG_OAI_SLEEP -- service unavailable between two consecutive ## requests for CFG_OAI_SLEEP seconds: CFG_OAI_SLEEP = 2 ## CFG_OAI_METADATA_FORMATS -- mapping between accepted metadataPrefixes and ## the corresponding output format to use, its schema and its metadataNamespace. CFG_OAI_METADATA_FORMATS = { 'marcxml': ('XOAIMARC', 'http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd', 'http://www.loc.gov/MARC21/slim'), 'oai_dc': ('XOAIDC', 'http://www.openarchives.org/OAI/1.1/dc.xsd', 'http://purl.org/dc/elements/1.1/'), } ## CFG_OAI_FRIENDS -- list of OAI baseURL of friend repositories. See: ## CFG_OAI_FRIENDS = http://cds.cern.ch/oai2d,http://openaire.cern.ch/oai2d,http://export.arxiv.org/oai2 ## The following subfields are a completion to ## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG.
If CFG_OAI_PROVENANCE_BASEURL_SUBFIELD is
## set for a record, then the corresponding field is considered as having been
## harvested via OAI-PMH

## CFG_OAI_PROVENANCE_BASEURL_SUBFIELD -- baseURL of the originDescription of a
## record
CFG_OAI_PROVENANCE_BASEURL_SUBFIELD = u

## CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD -- datestamp of the originDescription
## of a record
CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD = d

## CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD -- metadataNamespace of the
## originDescription of a record
CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD = m

## CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD -- originDescription of the
## originDescription of a record
CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD = d

## CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD -- harvestDate of the
## originDescription of a record
CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD = h

## CFG_OAI_PROVENANCE_ALTERED_SUBFIELD -- altered flag of the
## originDescription of a record
CFG_OAI_PROVENANCE_ALTERED_SUBFIELD = t

## CFG_OAI_FAILED_HARVESTING_STOP_QUEUE -- when harvesting OAI sources
## fails, shall we report an error with the task and stop BibSched
## queue, or simply wait for the next run of the task? A value of 0
## will stop the task upon errors, 1 will let the queue run if the
## next run of the oaiharvest task can safely recover the failure
## (this means that the queue will stop if the task is not set to run
## periodically)
CFG_OAI_FAILED_HARVESTING_STOP_QUEUE = 1

## CFG_OAI_FAILED_HARVESTING_EMAILS_ADMIN -- when
## CFG_OAI_FAILED_HARVESTING_STOP_QUEUE is set to leave the queue
## running upon errors, shall we send an email to admin to notify
## about the failure?
CFG_OAI_FAILED_HARVESTING_EMAILS_ADMIN = True

## NOTE: the following parameters are experimental
## -----------------------------------------------------------------------------
## CFG_OAI_RIGHTS_FIELD -- MARC field dedicated to storing Copyright information
CFG_OAI_RIGHTS_FIELD = 542__

## CFG_OAI_RIGHTS_HOLDER_SUBFIELD -- MARC subfield dedicated to storing the
## Copyright holder information
CFG_OAI_RIGHTS_HOLDER_SUBFIELD = d

## CFG_OAI_RIGHTS_DATE_SUBFIELD -- MARC subfield dedicated to storing the
## Copyright date information
CFG_OAI_RIGHTS_DATE_SUBFIELD = g

## CFG_OAI_RIGHTS_URI_SUBFIELD -- MARC subfield dedicated to storing the URI
## (URL or URN, more detailed statement about copyright status) information
CFG_OAI_RIGHTS_URI_SUBFIELD = u

## CFG_OAI_RIGHTS_CONTACT_SUBFIELD -- MARC subfield dedicated to storing the
## Copyright holder contact information
CFG_OAI_RIGHTS_CONTACT_SUBFIELD = e

## CFG_OAI_RIGHTS_STATEMENT_SUBFIELD -- MARC subfield dedicated to storing the
## Copyright statement as presented on the resource
CFG_OAI_RIGHTS_STATEMENT_SUBFIELD = f

## CFG_OAI_LICENSE_FIELD -- MARC field dedicated to storing terms governing
## use and reproduction (license)
CFG_OAI_LICENSE_FIELD = 540__

## CFG_OAI_LICENSE_TERMS_SUBFIELD -- MARC subfield dedicated to storing the
## Terms governing use and reproduction, e.g. CC License
CFG_OAI_LICENSE_TERMS_SUBFIELD = a

## CFG_OAI_LICENSE_PUBLISHER_SUBFIELD -- MARC subfield dedicated to storing the
## person or institution imposing the license (author, publisher)
CFG_OAI_LICENSE_PUBLISHER_SUBFIELD = b

## CFG_OAI_LICENSE_URI_SUBFIELD -- MARC subfield dedicated to storing the
## URI of the license
CFG_OAI_LICENSE_URI_SUBFIELD = u
##------------------------------------------------------------------------------

###################################
## Part 5: BibDocFile parameters ##
###################################

## This section contains some configuration parameters for BibDocFile
## module. 
## CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_DOCTYPES -- this is the list of ## doctypes (like 'Main' or 'Additional') and their description that admins ## can choose from when adding new files via the Document File Manager ## admin interface. ## - When no value is provided, admins cannot add new ## file (they can only revise/delete/add format) ## - When a single value is given, it is used as ## default doctype for all new documents ## ## Order is relevant ## Eg: ## [('main', 'Main document'), ('additional', 'Figure, schema. etc')] CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_DOCTYPES = [ ('Main', 'Main document'), ('LaTeX', 'LaTeX'), ('Source', 'Source'), ('Additional', 'Additional File'), ('Audio', 'Audio file'), ('Video', 'Video file'), ('Script', 'Script'), ('Data', 'Data'), ('Figure', 'Figure'), ('Schema', 'Schema'), ('Graph', 'Graph'), ('Image', 'Image'), ('Drawing', 'Drawing'), ('Slides', 'Slides')] ## CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_RESTRICTIONS -- this is the ## list of restrictions (like 'Restricted' or 'No Restriction') and their ## description that admins can choose from when adding or revising files. ## Restrictions can then be configured at the level of WebAccess. ## - When no value is provided, no restriction is ## applied ## - When a single value is given, it is used as ## default resctriction for all documents. ## - The first value of the list is used as default ## restriction if the user if not given the ## choice of the restriction. Order is relevant ## ## Eg: ## [('', 'No restriction'), ('restr', 'Restricted')] CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_RESTRICTIONS = [ ('', 'Public'), ('restricted', 'Restricted')] ## CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_MISC -- set here the other ## default flags and attributes to tune the Document File Manager admin ## interface. ## See the docstring of bibdocfile_managedocfiles.create_file_upload_interface ## to have a description of the available parameters and their syntax. 
## In general you will rarely need to change this variable.
CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_MISC = {
    'can_revise_doctypes': ['*'],
    'can_comment_doctypes': ['*'],
    'can_describe_doctypes': ['*'],
    'can_delete_doctypes': ['*'],
    'can_keep_doctypes': ['*'],
    'can_rename_doctypes': ['*'],
    'can_add_format_to_doctypes': ['*'],
    'can_restrict_doctypes': ['*'],
    }

## CFG_BIBDOCFILE_FILESYSTEM_BIBDOC_GROUP_LIMIT -- the fulltext
## documents are stored under "/opt/invenio/var/data/files/gX/Y"
## directories where X is 0,1,... and Y stands for bibdoc ID. Thusly
## documents Y are grouped into directories X and this variable
## indicates the maximum number of documents Y stored in each
## directory X. This limit is imposed solely for filesystem
## performance reasons in order not to have too many subdirectories in
## a given directory.
CFG_BIBDOCFILE_FILESYSTEM_BIBDOC_GROUP_LIMIT = 5000

## CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS -- a comma-separated
## list of document extensions not listed in Python standard mimetype
## library that should be recognized by Invenio.
CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS = hpg,link,lis,llb,mat,mpp,msg,docx,docm,xlsx,xlsm,xlsb,pptx,pptm,ppsx,ppsm

## CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES -- a mapping of additional
## mimetypes that could be served or have to be recognized by this instance
## of Invenio (this is useful in order to patch old versions of the
## mimetypes Python module).
CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES = {
    "application/xml-dtd": ".dtd",
    }

## CFG_BIBDOCFILE_DESIRED_CONVERSIONS -- a dictionary having as keys
## a format and as values the corresponding list of desired converted
## formats. 
CFG_BIBDOCFILE_DESIRED_CONVERSIONS = { 'pdf' : ('pdf;pdfa', ), 'ps.gz' : ('pdf;pdfa', ), 'djvu' : ('pdf', ), 'sxw': ('doc', 'odt', 'pdf;pdfa', ), 'docx' : ('doc', 'odt', 'pdf;pdfa', ), 'doc' : ('odt', 'pdf;pdfa', 'docx'), 'rtf' : ('pdf;pdfa', 'odt', ), 'odt' : ('pdf;pdfa', 'doc', ), 'pptx' : ('ppt', 'odp', 'pdf;pdfa', ), 'ppt' : ('odp', 'pdf;pdfa', 'pptx'), 'sxi': ('odp', 'pdf;pdfa', ), 'odp' : ('pdf;pdfa', 'ppt', ), 'xlsx' : ('xls', 'ods', 'csv'), 'xls' : ('ods', 'csv'), 'ods' : ('xls', 'xlsx', 'csv'), 'sxc': ('xls', 'xlsx', 'csv'), 'tiff' : ('pdf;pdfa', ), 'tif' : ('pdf;pdfa', ),} ## CFG_BIBDOCFILE_USE_XSENDFILE -- if your web server supports ## XSendfile header, you may want to enable this feature in order for ## to Invenio tell the web server to stream files for download (after ## proper authorization checks) by web server's means. This helps to ## liberate Invenio worker processes from being busy with sending big ## files to clients. The web server will take care of that. Note: ## this feature is still somewhat experimental. Note: when enabled ## (set to 1), then you have to also regenerate Apache vhost conf ## snippets (inveniocfg --update-config-py --create-apache-conf). CFG_BIBDOCFILE_USE_XSENDFILE = 0 ## CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY -- a number between 0 and ## 1 that indicates probability with which MD5 checksum will be ## verified when streaming bibdocfile-managed files. (0.1 will cause ## the check to be performed once for every 10 downloads) CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY = 0.1 ## CFG_BIBDOCFILE_BEST_FORMATS_TO_EXTRACT_TEXT_FROM -- a comma-separated ## list of document extensions in decrescent order of preference ## to suggest what is considered the best format to extract text from. 
CFG_BIBDOCFILE_BEST_FORMATS_TO_EXTRACT_TEXT_FROM = ('txt', 'html', 'xml', 'odt', 'doc', 'docx', 'djvu', 'pdf', 'ps', 'ps.gz') ## CFG_BIBDOCFILE_ENABLE_BIBDOCFSINFO_CACHE -- whether to use the ## database table bibdocfsinfo as reference for filesystem ## information. The default is 0. Switch this to 1 ## after you have run bibdocfile --fix-bibdocfsinfo-cache ## or on an empty system. CFG_BIBDOCFILE_ENABLE_BIBDOCFSINFO_CACHE = 0 ## CFG_OPENOFFICE_SERVER_HOST -- the host where an OpenOffice Server is ## listening to. If localhost an OpenOffice server will be started ## automatically if it is not already running. ## Note: if you set this to an empty value this will disable the usage of ## OpenOffice for converting documents. ## If you set this to something different than localhost you'll have to take ## care to have an OpenOffice server running on the corresponding host and ## to install the same OpenOffice release both on the client and on the server ## side. ## In order to launch an OpenOffice server on a remote machine, just start ## the usual 'soffice' executable in this way: ## $> soffice -headless -nologo -nodefault -norestore -nofirststartwizard \ ## .. -accept=socket,host=HOST,port=PORT;urp;StarOffice.ComponentContext CFG_OPENOFFICE_SERVER_HOST = localhost ## CFG_OPENOFFICE_SERVER_PORT -- the port where an OpenOffice Server is ## listening to. CFG_OPENOFFICE_SERVER_PORT = 2002 ## CFG_OPENOFFICE_USER -- the user that will be used to launch the OpenOffice ## client. It is recommended to set this to a user who don't own files, like ## e.g. 'nobody'. You should also authorize your Apache server user to be ## able to become this user, e.g. by adding to your /etc/sudoers the following ## line: ## "apache ALL=(nobody) NOPASSWD: ALL" ## provided that apache is the username corresponding to the Apache user. ## On some machine this might be apache2 or www-data. 
CFG_OPENOFFICE_USER = nobody ################################# ## Part 6: BibIndex parameters ## ################################# ## This section contains some configuration parameters for BibIndex ## module. Please note that BibIndex is mostly configured on run-time ## via its BibIndex Admin web interface. The parameters below are the ## ones that you do not probably want to modify very often during the ## runtime. ## CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY -- when fulltext indexing, do ## you want to index locally stored files only, or also external URLs? ## Use "0" to say "no" and "1" to say "yes". CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY = 1 ## CFG_BIBINDEX_REMOVE_STOPWORDS -- when indexing, do we want to remove ## stopwords? Use "0" to say "no" and "1" to say "yes". CFG_BIBINDEX_REMOVE_STOPWORDS = 0 ## CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS -- characters considered as ## alphanumeric separators of word-blocks inside words. You probably ## don't want to change this. CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS = \!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~ ## CFG_BIBINDEX_CHARS_PUNCTUATION -- characters considered as punctuation ## between word-blocks inside words. You probably don't want to ## change this. CFG_BIBINDEX_CHARS_PUNCTUATION = \.\,\:\;\?\!\" ## CFG_BIBINDEX_REMOVE_HTML_MARKUP -- should we attempt to remove HTML markup ## before indexing? Use 1 if you have HTML markup inside metadata ## (e.g. in abstracts), use 0 otherwise. CFG_BIBINDEX_REMOVE_HTML_MARKUP = 0 ## CFG_BIBINDEX_REMOVE_LATEX_MARKUP -- should we attempt to remove LATEX markup ## before indexing? Use 1 if you have LATEX markup inside metadata ## (e.g. in abstracts), use 0 otherwise. CFG_BIBINDEX_REMOVE_LATEX_MARKUP = 0 ## CFG_BIBINDEX_MIN_WORD_LENGTH -- minimum word length allowed to be added to ## index. The terms smaller then this amount will be discarded. 
## Useful to keep the database clean, however you can safely leave ## this value on 0 for up to 1,000,000 documents. CFG_BIBINDEX_MIN_WORD_LENGTH = 0 ## CFG_BIBINDEX_URLOPENER_USERNAME and CFG_BIBINDEX_URLOPENER_PASSWORD -- ## access credentials to access restricted URLs, interesting only if ## you are fulltext-indexing files located on a remote server that is ## only available via username/password. But it's probably better to ## handle this case via IP or some convention; the current scheme is ## mostly there for demo only. CFG_BIBINDEX_URLOPENER_USERNAME = mysuperuser CFG_BIBINDEX_URLOPENER_PASSWORD = mysuperpass ## CFG_INTBITSET_ENABLE_SANITY_CHECKS -- ## Enable sanity checks for integers passed to the intbitset data ## structures. It is good to enable this during debugging ## and to disable this value for speed improvements. CFG_INTBITSET_ENABLE_SANITY_CHECKS = False ## CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES -- regular expression that matches ## docnames for which OCR is desired (set this to .* in order to enable ## OCR in general, set this to empty in order to disable it.) CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES = scan-.* ## CFG_BIBINDEX_SPLASH_PAGES -- key-value mapping where the key corresponds ## to a regular expression that matches the URLs of the splash pages of ## a given service and the value is a regular expression of the set of URLs ## referenced via tags in the HTML content of the splash pages that are ## referring to documents that need to be indexed. 
## NOTE: for backward compatibility reasons you can set this to a simple ## regular expression that will directly be used as the unique key of the ## map, with corresponding value set to ".*" (in order to match any URL) CFG_BIBINDEX_SPLASH_PAGES = { "http://documents\.cern\.ch/setlink\?.*": ".*", "http://ilcagenda\.linearcollider\.org/subContributionDisplay\.py\?.*|http://ilcagenda\.linearcollider\.org/contributionDisplay\.py\?.*": "http://ilcagenda\.linearcollider\.org/getFile\.py/access\?.*|http://ilcagenda\.linearcollider\.org/materialDisplay\.py\?.*", } ## CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES -- do we want ## the author word index to exclude first names to keep only last ## names? If set to True, then for the author `Bernard, Denis', only ## `Bernard' will be indexed in the word index, not `Denis'. Note ## that if you change this variable, you have to re-index the author ## index via `bibindex -w author -R'. CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES = False ## CFG_BIBINDEX_SYNONYM_KBRS -- defines which knowledge bases are to ## be used for which index in order to provide index-time synonym ## lookup, and what massaging function should be used upon search ## pattern before performing the KB lookup. (Can be one of `exact', ## 'leading_to_comma', `leading_to_number'.) CFG_BIBINDEX_SYNONYM_KBRS = { 'global': ['INDEX-SYNONYM-TITLE', 'exact'], 'title': ['INDEX-SYNONYM-TITLE', 'exact'], } ####################################### ## Part 7: Access control parameters ## ####################################### ## This section contains some configuration parameters for the access ## control system. Please note that WebAccess is mostly configured on ## run-time via its WebAccess Admin web interface. The parameters ## below are the ones that you do not probably want to modify very ## often during the runtime. 
(If you do want to modify them during ## runtime, for example te deny access temporarily because of backups, ## you can edit access_control_config.py directly, no need to get back ## here and no need to redo the make process.) ## CFG_ACCESS_CONTROL_LEVEL_SITE -- defines how open this site is. ## Use 0 for normal operation of the site, 1 for read-only site (all ## write operations temporarily closed), 2 for site fully closed, ## 3 for also disabling any database connection. ## Useful for site maintenance. CFG_ACCESS_CONTROL_LEVEL_SITE = 0 ## CFG_ACCESS_CONTROL_LEVEL_GUESTS -- guest users access policy. Use ## 0 to allow guest users, 1 not to allow them (all users must login). CFG_ACCESS_CONTROL_LEVEL_GUESTS = 0 ## CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS -- account registration and ## activation policy. When 0, users can register and accounts are ## automatically activated. When 1, users can register but admin must ## activate the accounts. When 2, users cannot register nor update ## their email address, only admin can register accounts. When 3, ## users cannot register nor update email address nor password, only ## admin can register accounts. When 4, the same as 3 applies, nor ## user cannot change his login method. When 5, then the same as 4 ## applies, plus info about how to get an account is hidden from the ## login page. CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS = 0 ## CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN -- limit account ## registration to certain email addresses? If wanted, give domain ## name below, e.g. "cern.ch". If not wanted, leave it empty. CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN = ## CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS -- send a ## notification email to the administrator when a new account is ## created? Use 0 for no, 1 for yes. 
CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS = 0 ## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT -- send a ## notification email to the user when a new account is created in order to ## to verify the validity of the provided email address? Use ## 0 for no, 1 for yes. CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT = 1 ## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_ACTIVATION -- send a ## notification email to the user when a new account is activated? ## Use 0 for no, 1 for yes. CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_ACTIVATION = 0 ## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_DELETION -- send a ## notification email to the user when a new account is deleted or ## account demand rejected? Use 0 for no, 1 for yes. CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_DELETION = 0 ## CFG_APACHE_PASSWORD_FILE -- the file where Apache user credentials ## are stored. Must be an absolute pathname. If the value does not ## start by a slash, it is considered to be the filename of a file ## located under prefix/var/tmp directory. This is useful for the ## demo site testing purposes. For the production site, if you plan ## to restrict access to some collections based on the Apache user ## authentication mechanism, you should put here an absolute path to ## your Apache password file. CFG_APACHE_PASSWORD_FILE = demo-site-apache-user-passwords ## CFG_APACHE_GROUP_FILE -- the file where Apache user groups are ## defined. See the documentation of the preceding config variable. CFG_APACHE_GROUP_FILE = demo-site-apache-user-groups ################################### ## Part 8: WebSession parameters ## ################################### ## This section contains some configuration parameters for tweaking ## session handling. ## CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT -- number of days after which a session ## and the corresponding cookie is considered expired. 
CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT = 2 ## CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER -- number of days after which a session ## and the corresponding cookie is considered expired, when the user has ## requested to permanently stay logged in. CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER = 365 ## CFG_WEBSESSION_RESET_PASSWORD_EXPIRE_IN_DAYS -- when user requested ## a password reset, for how many days is the URL valid? CFG_WEBSESSION_RESET_PASSWORD_EXPIRE_IN_DAYS = 3 ## CFG_WEBSESSION_ADDRESS_ACTIVATION_EXPIRE_IN_DAYS -- when an account ## activation email was sent, for how many days is the URL valid? CFG_WEBSESSION_ADDRESS_ACTIVATION_EXPIRE_IN_DAYS = 3 ## CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS -- when ## user won't confirm his email address and not complete ## registeration, after how many days will it expire? CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS = 10 ## CFG_WEBSESSION_DIFFERENTIATE_BETWEEN_GUESTS -- when set to 1, the session ## system allocates the same uid=0 to all guests users regardless of where they ## come from. 0 allocate a unique uid to each guest. CFG_WEBSESSION_DIFFERENTIATE_BETWEEN_GUESTS = 0 ## CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS -- to prevent session cookie ## stealing, Invenio checks that the IP address of a connection is the ## same as that of the connection which created the initial session. ## This variable let you decide how many bits should be skipped during ## this check. Set this to 0 in order to enable full IP address ## checking. Set this to 32 in order to disable IP address checking. ## Intermediate values (say 8) let you have some degree of security so ## that you can trust your local network only while helping to solve ## issues related to outside clients that configured their browser to ## use a web proxy for HTTP connection but not for HTTPS, thus ## potentially having two different IP addresses. 
In general, if you use
## HTTPS in order to serve authenticated content, you can safely set
## CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS to 32.
CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS = 0

################################
## Part 9: BibRank parameters ##
################################

## This section contains some configuration parameters for the ranking
## system.

## CFG_BIBRANK_SHOW_READING_STATS -- do we want to show reading
## similarity stats? ('People who viewed this page also viewed')
CFG_BIBRANK_SHOW_READING_STATS = 1

## CFG_BIBRANK_SHOW_DOWNLOAD_STATS -- do we want to show the download
## similarity stats? ('People who downloaded this document also
## downloaded')
CFG_BIBRANK_SHOW_DOWNLOAD_STATS = 1

## CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS -- do we want to show download
## history graph? (0=no | 1=classic/gnuplot | 2=flot)
CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS = 1

## CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION -- do we
## want to show a graph representing the distribution of client IPs
## downloading given document? (0=no | 1=classic/gnuplot | 2=flot)
CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION = 0

## CFG_BIBRANK_SHOW_CITATION_LINKS -- do we want to show the 'Cited
## by' links? (useful only when you have citations in the metadata)
CFG_BIBRANK_SHOW_CITATION_LINKS = 1

## CFG_BIBRANK_SHOW_CITATION_STATS -- do we want to show citation
## stats? ('Cited by M records', 'Co-cited with N records')
CFG_BIBRANK_SHOW_CITATION_STATS = 1

## CFG_BIBRANK_SHOW_CITATION_GRAPHS -- do we want to show citation
## history graph? (0=no | 1=classic/gnuplot | 2=flot)
CFG_BIBRANK_SHOW_CITATION_GRAPHS = 1

## CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID -- use authorids for computing
## self-citations
## falls back to hashing the author string
CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID = 0

## CFG_BIBRANK_SELFCITES_PRECOMPUTE -- use precomputed self-citations
## when displaying citesummary. 
Precomputing citations allows use to ## speed up things CFG_BIBRANK_SELFCITES_PRECOMPUTE = 0 #################################### ## Part 10: WebComment parameters ## #################################### ## This section contains some configuration parameters for the ## commenting and reviewing facilities. ## CFG_WEBCOMMENT_ALLOW_COMMENTS -- do we want to allow users write ## public comments on records? CFG_WEBCOMMENT_ALLOW_COMMENTS = 1 ## CFG_WEBCOMMENT_ALLOW_REVIEWS -- do we want to allow users write ## public reviews of records? CFG_WEBCOMMENT_ALLOW_REVIEWS = 1 ## CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS -- do we want to allow short ## reviews, that is just the attribution of stars without submitting ## detailed review text? CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS = 0 ## CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN -- if users ## report a comment to be abusive, how many they have to be before the ## site admin is alerted? CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN = 5 ## CFG_WEBCOMMENT_NB_COMMENTS_IN_DETAILED_VIEW -- how many comments do ## we display in the detailed record page upon welcome? CFG_WEBCOMMENT_NB_COMMENTS_IN_DETAILED_VIEW = 1 ## CFG_WEBCOMMENT_NB_REVIEWS_IN_DETAILED_VIEW -- how many reviews do ## we display in the detailed record page upon welcome? CFG_WEBCOMMENT_NB_REVIEWS_IN_DETAILED_VIEW = 1 ## CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL -- do we notify the site ## admin after every comment? CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL = 1 ## CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS -- how many ## elapsed seconds do we consider enough when checking for possible ## multiple comment submissions by a user? CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS = 20 ## CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_REVIEWS_IN_SECONDS -- how many ## elapsed seconds do we consider enough when checking for possible ## multiple review submissions by a user? 
CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_REVIEWS_IN_SECONDS = 20

## CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR -- enable the WYSIWYG
## Javascript-based editor when user edits comments?
CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR = False

## CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL -- the email address from which the
## alert emails will appear to be sent:
CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL = info@invenio-software.org

## CFG_WEBCOMMENT_DEFAULT_MODERATOR -- if no rules are
## specified to indicate who is the comment moderator of
## a collection, this person will be used as default
CFG_WEBCOMMENT_DEFAULT_MODERATOR = info@invenio-software.org

## CFG_WEBCOMMENT_USE_MATHJAX_IN_COMMENTS -- do we want to allow the use
## of MathJax plugin to render latex input in comments?
CFG_WEBCOMMENT_USE_MATHJAX_IN_COMMENTS = 1

## CFG_WEBCOMMENT_AUTHOR_DELETE_COMMENT_OPTION -- allow comment author to
## delete its own comment?
CFG_WEBCOMMENT_AUTHOR_DELETE_COMMENT_OPTION = 1

# CFG_WEBCOMMENT_EMAIL_REPLIES_TO -- which field of the record defines
# email addresses that should be notified of newly submitted comments,
# and for which collection. Use collection names as keys, and list of
# tags as values
CFG_WEBCOMMENT_EMAIL_REPLIES_TO = {
    'Articles': ['506__d', '506__m'],
    }

# CFG_WEBCOMMENT_RESTRICTION_DATAFIELD -- which field of the record
# defines the restriction (must be linked to WebAccess
# 'viewrestrcomment') to apply to newly submitted comments, and for
# which collection. Use collection names as keys, and one tag as value
CFG_WEBCOMMENT_RESTRICTION_DATAFIELD = {
    'Articles': '5061_a',
    'Pictures': '5061_a',
    'Theses': '5061_a',
    }

# CFG_WEBCOMMENT_ROUND_DATAFIELD -- which field of the record defines
# the current round of comment for which collection. Use collection
# name as key, and one tag as value
CFG_WEBCOMMENT_ROUND_DATAFIELD = {
    'Articles': '562__c',
    'Pictures': '562__c',
    }

# CFG_WEBCOMMENT_MAX_ATTACHMENT_SIZE -- max file size per attached
# file, in bytes. 
Choose 0 if you don't want to limit the size
CFG_WEBCOMMENT_MAX_ATTACHMENT_SIZE = 5242880

# CFG_WEBCOMMENT_MAX_ATTACHED_FILES -- maximum number of files that can
# be attached per comment. Choose 0 if you don't want to limit the
# number of files. File uploads can be restricted with action
# "attachcommentfile".
CFG_WEBCOMMENT_MAX_ATTACHED_FILES = 5

# CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH -- how many levels of
# indentation discussions can be. This can be used to ensure that
# discussions will not go into deep levels of nesting if users don't
# understand the difference between "reply to comment" and "add
# comment". When the depth is reached, any "reply to comment" is
# conceptually converted to a "reply to thread" (i.e. reply to this
# parent's comment). Use -1 for no limit, 0 for unthreaded (flat)
# discussions.
CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH = 1

##################################
## Part 11: BibSched parameters ##
##################################

## This section contains some configuration parameters for the
## bibliographic task scheduler.

## CFG_BIBSCHED_REFRESHTIME -- how often do we want to refresh
## bibsched monitor? (in seconds)
CFG_BIBSCHED_REFRESHTIME = 5

## CFG_BIBSCHED_LOG_PAGER -- what pager to use to view bibsched task
## logs?
CFG_BIBSCHED_LOG_PAGER = /usr/bin/less

## CFG_BIBSCHED_EDITOR -- what editor to use to edit the marcxml
## code of the locked records
CFG_BIBSCHED_EDITOR = /usr/bin/vim

## CFG_BIBSCHED_GC_TASKS_OLDER_THAN -- after how many days to perform the
## garbage collector of BibSched queue (i.e. removing/moving task to archive).
CFG_BIBSCHED_GC_TASKS_OLDER_THAN = 30

## CFG_BIBSCHED_GC_TASKS_TO_REMOVE -- list of BibTask that can be safely
## removed from the BibSched queue once they are DONE.
CFG_BIBSCHED_GC_TASKS_TO_REMOVE = bibindex,bibreformat,webcoll,bibrank,inveniogc

## CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE -- list of BibTasks that should be safely
## archived out of the BibSched queue once they are DONE. 
CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE = bibupload,oairepositoryupdater ## CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS -- maximum number of BibTasks ## that can run concurrently. ## NOTE: concurrent tasks are still considered as an experimental ## feature. Please keep this value set to 1 on production environments. CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS = 1 ## CFG_BIBSCHED_PROCESS_USER -- bibsched and bibtask processes must ## usually run under the same identity as the Apache web server ## process in order to share proper file read/write privileges. If ## you want to force some other bibsched/bibtask user, e.g. because ## you are using a local `invenio' user that belongs to your ## `www-data' Apache user group and so shares writing rights with your ## Apache web server process in this way, then please set its username ## identity here. Otherwise we shall check whether your ## bibsched/bibtask processes are run under the same identity as your ## Apache web server process (in which case you can leave the default ## empty value here). CFG_BIBSCHED_PROCESS_USER = ## CFG_BIBSCHED_NODE_TASKS -- specific nodes may be configured to ## run only specific tasks; if you want this, then this variable is a ## dictionary of the form {'hostname1': ['task1', 'task2']}. The ## default is that any node can run any task. CFG_BIBSCHED_NODE_TASKS = {} ## CFG_BIBSCHED_MAX_ARCHIVED_ROWS_DISPLAY -- number of tasks displayed ## CFG_BIBSCHED_MAX_ARCHIVED_ROWS_DISPLAY = 500 ################################### ## Part 12: WebBasket parameters ## ################################### ## CFG_WEBBASKET_MAX_NUMBER_OF_DISPLAYED_BASKETS -- a safety limit for ## a maximum number of displayed baskets CFG_WEBBASKET_MAX_NUMBER_OF_DISPLAYED_BASKETS = 20 ## CFG_WEBBASKET_USE_RICH_TEXT_EDITOR -- enable the WYSIWYG ## Javascript-based editor when user edits comments in WebBasket? 
CFG_WEBBASKET_USE_RICH_TEXT_EDITOR = False ################################## ## Part 13: WebAlert parameters ## ################################## ## This section contains some configuration parameters for the ## automatic email notification alert system. ## CFG_WEBALERT_ALERT_ENGINE_EMAIL -- the email address from which the ## alert emails will appear to be sent: CFG_WEBALERT_ALERT_ENGINE_EMAIL = info@invenio-software.org ## CFG_WEBALERT_MAX_NUM_OF_RECORDS_IN_ALERT_EMAIL -- how many records ## at most do we send in an outgoing alert email? CFG_WEBALERT_MAX_NUM_OF_RECORDS_IN_ALERT_EMAIL = 20 ## CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL -- number of ## chars per line in an outgoing alert email? CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL = 72 ## CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES -- when sending alert ## emails fails, how many times we retry? CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES = 3 ## CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES -- when sending ## alert emails fails, what is the sleeptime between tries? (in ## seconds) CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES = 300 #################################### ## Part 14: WebMessage parameters ## #################################### ## CFG_WEBMESSAGE_MAX_SIZE_OF_MESSAGE -- how large web messages do we ## allow? CFG_WEBMESSAGE_MAX_SIZE_OF_MESSAGE = 20000 ## CFG_WEBMESSAGE_MAX_NB_OF_MESSAGES -- how many messages for a ## regular user do we allow in its inbox? CFG_WEBMESSAGE_MAX_NB_OF_MESSAGES = 30 ## CFG_WEBMESSAGE_DAYS_BEFORE_DELETE_ORPHANS -- how many days before ## we delete orphaned messages? CFG_WEBMESSAGE_DAYS_BEFORE_DELETE_ORPHANS = 60 ################################## ## Part 15: MiscUtil parameters ## ################################## ## CFG_MISCUTIL_SQL_USE_SQLALCHEMY -- whether to use SQLAlchemy.pool ## in the DB engine of Invenio. It is okay to enable this flag ## even if you have not installed SQLAlchemy. 
Note that Invenio will ## lose some performance if this option is enabled. CFG_MISCUTIL_SQL_USE_SQLALCHEMY = False ## CFG_MISCUTIL_SQL_RUN_SQL_MANY_LIMIT -- how many queries can we run ## inside run_sql_many() in one SQL statement? The limit value ## depends on MySQL's max_allowed_packet configuration. CFG_MISCUTIL_SQL_RUN_SQL_MANY_LIMIT = 10000 ## CFG_MISCUTIL_SMTP_HOST -- which server to use as outgoing mail server to ## send outgoing emails generated by the system, for example concerning ## submissions or email notification alerts. CFG_MISCUTIL_SMTP_HOST = localhost ## CFG_MISCUTIL_SMTP_PORT -- which port to use on the outgoing mail server ## defined in the previous step. CFG_MISCUTIL_SMTP_PORT = 25 ## CFG_MISCUTIL_SMTP_USER -- which username to use on the outgoing mail server ## defined in CFG_MISCUTIL_SMTP_HOST. If either CFG_MISCUTIL_SMTP_USER or ## CFG_MISCUTIL_SMTP_PASS are empty Invenio won't attempt authentication. CFG_MISCUTIL_SMTP_USER = ## CFG_MISCUTIL_SMTP_PASS -- which password to use on the outgoing mail ## server defined in CFG_MISCUTIL_SMTP_HOST. If either CFG_MISCUTIL_SMTP_USER ## or CFG_MISCUTIL_SMTP_PASS are empty Invenio won't attempt authentication. CFG_MISCUTIL_SMTP_PASS = ## CFG_MISCUTIL_SMTP_TLS -- whether to use a TLS (secure) connection when ## talking to the SMTP server defined in CFG_MISCUTIL_SMTP_HOST. CFG_MISCUTIL_SMTP_TLS = False ## CFG_MISCUTIL_DEFAULT_PROCESS_TIMEOUT -- the default number of seconds after ## which a process launched through shellutils.run_process_with_timeout will ## be killed. This is useful to catch runaway processes. CFG_MISCUTIL_DEFAULT_PROCESS_TIMEOUT = 300 ## CFG_MATHJAX_HOSTING -- if you plan to use MathJax to display TeX ## formulas on HTML web pages, you can specify whether you wish to use ## 'local' hosting or 'cdn' hosting of MathJax libraries. (If set to ## 'local', you have to run 'make install-mathjax-plugin' as described ## in the INSTALL guide.)
If set to 'local', users will use your site ## to download MathJax sources. If set to 'cdn', users will use ## centralized MathJax CDN servers instead. Please note that using ## CDN is suitable only for small institutions or for MathJax ## sponsors; see the MathJax website for more details. (Also, please ## note that if you plan to use MathJax on your site, you have to ## adapt CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS and ## CFG_WEBCOMMENT_USE_MATHJAX_IN_COMMENTS configuration variables ## elsewhere in this file.) CFG_MATHJAX_HOSTING = local ################################# ## Part 16: BibEdit parameters ## ################################# ## CFG_BIBEDIT_TIMEOUT -- when a user edits a record, this record is ## locked to prevent other users from editing it at the same time. ## How many seconds of inactivity before the locked record again will be free ## for other people to edit? CFG_BIBEDIT_TIMEOUT = 3600 ## CFG_BIBEDIT_LOCKLEVEL -- when a user tries to edit a record which there ## is a pending bibupload task for in the queue, this shouldn't be permitted. ## The lock level determines how thoroughly the queue should be investigated ## to determine if this is the case. ## Level 0 - always permits editing, doesn't look at the queue ## (unsafe, use only if you know what you are doing) ## Level 1 - permits editing if there are no queued bibedit tasks for this record ## (safe with respect to bibedit, but not for other bibupload maintenance jobs) ## Level 2 - permits editing if there are no queued bibupload tasks of any sort ## (safe, but may lock more than necessary if many cataloguers around) ## Level 3 - permits editing if no queued bibupload task concerns given record ## (safe, most precise locking, but slow, ## checks for 001/EXTERNAL_SYSNO_TAG/EXTERNAL_OAIID_TAG) ## The recommended level is 3 (default) or 2 (if you use maintenance jobs often).
CFG_BIBEDIT_LOCKLEVEL = 3 ## CFG_BIBEDIT_PROTECTED_FIELDS -- a comma-separated list of fields that BibEdit ## will not allow to be added, edited or deleted. Wildcards are not supported, ## but conceptually a wildcard is added at the end of every field specification. ## Examples: ## 500A - protect all MARC fields with tag 500 and first indicator A ## 5 - protect all MARC fields in the 500-series. ## 909C_a - protect subfield a in tag 909 with first indicator C and empty ## second indicator ## Note that 001 is protected by default, but if protection of other ## identifiers or automated fields is a requirement, they should be added to ## this list. CFG_BIBEDIT_PROTECTED_FIELDS = ## CFG_BIBEDIT_QUEUE_CHECK_METHOD -- how do we want to check for ## possible queue locking situations to prevent cataloguers from ## editing a record that may be waiting in the queue? Use 'bibrecord' ## for exact checking (always works, but may be slow), use 'regexp' ## for regular expression based checking (very fast, but may be ## inaccurate). When unsure, use 'bibrecord'. CFG_BIBEDIT_QUEUE_CHECK_METHOD = bibrecord ## CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE -- a dictionary ## containing which collections will be extended with a given template ## while being displayed in BibEdit UI. The collection corresponds with ## the value written in field 980 CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE = { 'POETRY' : 'record_poem'} ## CFG_BIBEDIT_KB_SUBJECTS - Name of the KB used in the field 65017a ## to automatically convert codes into extended version. e.g ## a - Astrophysics CFG_BIBEDIT_KB_SUBJECTS = Subjects ## CFG_BIBEDIT_KB_INSTITUTIONS - Name of the KB used for institution ## autocomplete. 
To be applied in fields defined in ## CFG_BIBEDIT_AUTOCOMPLETE_INSTITUTIONS_FIELDS CFG_BIBEDIT_KB_INSTITUTIONS = InstitutionsCollection ## CFG_BIBEDIT_AUTOCOMPLETE_INSTITUTIONS_FIELDS - list of fields to ## be autocompleted with the KB CFG_BIBEDIT_KB_INSTITUTIONS CFG_BIBEDIT_AUTOCOMPLETE_INSTITUTIONS_FIELDS = 100__u,700__u,701__u,502__c ## CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING -- maximum number of records ## that can be modified instantly using the multi-record editor. Above ## this limit, modifications will only be executed in limited hours. CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING = 2000 ## CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING -- maximum number of records ## that can be sent for modification without having a superadmin role. ## If the number of records is between CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING ## and this number, the modifications will take place only in limited hours. CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING = 20000 ## CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING_TIME -- Allowed time to ## execute modifications on records, when the number exceeds ## CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING. CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING_TIME = 22:00-05:00 ################################### ## Part 17: BibUpload parameters ## ################################### ## CFG_BIBUPLOAD_REFERENCE_TAG -- where do we store references? CFG_BIBUPLOAD_REFERENCE_TAG = 999 ## CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG -- where do we store external ## system numbers? Useful for matching when our records come from an ## external digital library system. CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG = 970__a ## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG -- where do we store OAI ID tags ## of harvested records? Useful for matching when we harvest stuff ## via OAI that we do not want to reexport via Invenio OAI; so records ## may have only the source OAI ID stored in this tag (kind of like ## external system number too).
CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG = 035__a ## CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG -- where do we store OAI SRC ## tags of harvested records? Useful for matching when we harvest stuff ## via OAI that we do not want to reexport via Invenio OAI; so records ## may have only the source OAI SRC stored in this tag (kind of like ## external system number too). Note that the field should be the same of ## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG. CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG = 035__9 ## CFG_BIBUPLOAD_STRONG_TAGS -- a comma-separated list of tags that ## are strong enough to resist the replace mode. Useful for tags that ## might be created from an external non-metadata-like source, ## e.g. the information about the number of copies left. CFG_BIBUPLOAD_STRONG_TAGS = 964 ## CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS -- a comma-separated list ## of tags that contain provenance information that should be checked ## in the bibupload correct mode via matching provenance codes. (Only ## field instances of the same provenance information would be acted ## upon.) Please specify the whole tag info up to subfield codes. CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS = 6531_9 ## CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS -- a comma-separated list of system ## paths from which it is allowed to take fulltextes that will be uploaded via ## FFT (CFG_TMPDIR is included by default). CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS = /tmp,/home ## CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS -- a dictionary containing ## external URLs that can be accessed by Invenio and specific HTTP ## headers that will be used for each URL. The keys of the dictionary ## are regular expressions matching a set of URLs, the values are ## dictionaries of headers as consumed by urllib2.Request. If a ## regular expression matching all URLs is created at the end of the ## list, it means that Invenio will download all URLs. Otherwise ## Invenio will just download authorized URLs. 
Note: by default, a ## User-Agent built after the current Invenio version, site name, and ## site URL will be used. The values of the header dictionary can ## also contain a call to a python function, in the form of a ## dictionary with two entries: the name of the function to be called ## as a value for the 'fnc' key, and the arguments to this function, ## as a value for the 'args' key (in the form of a dictionary). ## CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS = [ ## ('http://myurl.com/.*', {'User-Agent': 'Me'}), ## ('http://yoururl.com/.*', {'User-Agent': 'You', 'Accept': 'text/plain'}), ## ('http://thisurl.com/.*', {'Cookie': {'fnc':'read_cookie', 'args':{'cookiefile':'/tmp/cookies.txt'}}}) ## ('http://.*', {'User-Agent': 'Invenio'}), ## ] CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS = [ ('http(s)?://.*', {}), ] ## CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE -- do we want to serialize ## internal representation of records (Pythonic record structure) into ## the database? This can improve internal processing speed of some ## operations at the price of somewhat bigger disk space usage. ## If you change this value after some records have already been added ## to your installation, you may want to run: ## $ /opt/invenio/bin/inveniocfg --reset-recstruct-cache ## in order to either erase the cache thus freeing database space, ## or to fill the cache for all records that have not been cached yet. CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE = 1 ## CFG_BIBUPLOAD_DELETE_FORMATS -- which formats do we want bibupload ## to delete when a record is ingested? Enter comma-separated list of ## formats. For example, 'hb,hd' will delete pre-formatted HTML brief ## and detailed formats from cache, so that search engine will ## generate them on-the-fly. Useful to always present latest data of ## records upon record display, until the periodical bibreformat job ## runs next and updates the cache.
CFG_BIBUPLOAD_DELETE_FORMATS = hb ## CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY -- a comma-separated list ## indicating which fields match the file names of the documents to be ## uploaded. ## The matching will be done in the same order as the list provided. CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY = reportnumber,recid ## CFG_BATCHUPLOADER_DAEMON_DIR -- Directory where the batchuploader daemon ## will look for the subfolders metadata and document by default. ## If path is relative, CFG_PREFIX will be joined as a prefix CFG_BATCHUPLOADER_DAEMON_DIR = var/batchupload ## CFG_BATCHUPLOADER_WEB_ROBOT_AGENTS -- Regular expression to specify the ## agents permitted when calling batch uploader web interface ## cds.cern.ch/batchuploader/robotupload ## if using a curl, eg: curl xxx -A invenio CFG_BATCHUPLOADER_WEB_ROBOT_AGENTS = invenio_webupload|Invenio-.* ## CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS -- Access list specifying for each ## IP address, which collections are allowed using batch uploader robot ## interface. CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS = { '127.0.0.1': ['*'], # useful for testing '127.0.1.1': ['*'], # useful for testing '10.0.0.1': ['BOOK', 'REPORT'], # Example 1 '10.0.0.2': ['POETRY', 'PREPRINT'], # Example 2 } #################################### ## Part 18: BibCatalog parameters ## #################################### ## CFG_BIBCATALOG_SYSTEM -- set desired catalog system. For example, RT. CFG_BIBCATALOG_SYSTEM = ## RT CONFIGURATION ## CFG_BIBCATALOG_SYSTEM_RT_CLI -- path to the RT CLI client CFG_BIBCATALOG_SYSTEM_RT_CLI = /usr/bin/rt ## CFG_BIBCATALOG_SYSTEM_RT_URL -- Base URL of the remote RT system CFG_BIBCATALOG_SYSTEM_RT_URL = http://localhost/rt3 ## CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER -- Set the username for a default RT account ## on remote system, with limited privileges, in order to only create and modify own tickets. 
CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER = ## CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD -- Set the password for the default RT account ## on remote system. CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD = #################################### ## Part 19: BibFormat parameters ## #################################### ## CFG_BIBFORMAT_HIDDEN_TAGS -- comma-separated list of MARC tags that ## are not shown to users not having cataloging authorizations. CFG_BIBFORMAT_HIDDEN_TAGS = 595 ## CFG_BIBFORMAT_HIDDEN_FILE_FORMATS -- comma-separated list of file formats ## that are not shown explicitly to users not having cataloging authorizations. ## e.g. pdf;pdfa,xml CFG_BIBFORMAT_HIDDEN_FILE_FORMATS = ## CFG_BIBFORMAT_ADDTHIS_ID -- if you want to use the AddThis service from ## http://www.addthis.com/, set this value to the pubid parameter as ## provided by the service (e.g. ra-4ff80aae118f4dad), and add a call to ## the BFE_ADDTHIS formatting element in your formats, for example ## Default_HTML_detailed.bft. CFG_BIBFORMAT_ADDTHIS_ID = ## CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS -- For each output ## format BibReformat currently creates a cache for only one language ## (CFG_SITE_LANG) per record. This means that visitors having set a ## different language than CFG_SITE_LANG will be served an on-the-fly ## output using the language of their choice. You can disable this ## behaviour by specifying below for which output format you would ## like to force the cache to be used whatever language is ## requested. If your format templates do not provide ## internationalization, you can optimize your site by setting for ## eg. hb,hd to always serve the precached output (if it exists) in ## the CFG_SITE_LANG CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS = #################################### ## Part 20: BibMatch parameters ## #################################### ## CFG_BIBMATCH_LOCAL_SLEEPTIME -- Determines the amount of seconds to sleep ## between search queries on LOCAL system.
CFG_BIBMATCH_LOCAL_SLEEPTIME = 0.0 ## CFG_BIBMATCH_REMOTE_SLEEPTIME -- Determines the amount of seconds to sleep ## between search queries on REMOTE systems. CFG_BIBMATCH_REMOTE_SLEEPTIME = 2.0 ## CFG_BIBMATCH_FUZZY_WORDLIMITS -- Determines the amount of words to extract ## from a certain fields value during fuzzy matching mode. Add/change field ## and appropriate number to the dictionary to configure this. CFG_BIBMATCH_FUZZY_WORDLIMITS = { '100__a': 2, '245__a': 4 } ## CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT -- Determines the amount of empty results ## to accept during fuzzy matching mode. CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT = 1 ## CFG_BIBMATCH_QUERY_TEMPLATES -- Here you can set the various predefined querystrings ## used to standardize common matching queries. By default the following templates ## are given: ## title - standard title search. Taken from 245__a (default) ## title-author - title and author search (i.e. this is a title AND author a) ## Taken from 245__a and 100__a ## reportnumber - reportnumber search (i.e. reportnumber:REP-NO-123). CFG_BIBMATCH_QUERY_TEMPLATES = { 'title' : '[title]', 'title-author' : '[title] [author]', 'reportnumber' : 'reportnumber:[reportnumber]' } ## CFG_BIBMATCH_MATCH_VALIDATION_RULESETS -- Here you can define the various rulesets for ## validating search results done by BibMatch. Each ruleset contains a certain pattern mapped ## to a tuple defining a "matching-strategy". ## ## The rule-definitions must come in two parts: ## ## * The first part is a string containing a regular expression ## that is matched against the textmarc representation of each record. ## If a match is found, the final rule-set is updated with ## the given "sub rule-set", where identical tag rules are replaced. ## ## * The second item is a list of key->value mappings (dict) that indicates specific ## strategy parameters with corresponding validation rules. 
## ## This strategy consists of five items: ## ## * MARC TAGS: ## These MARC tags represents the fields taken from original record and any records from search ## results. When several MARC tags are specified with a given match-strategy, all the fields ## associated with these tags are matched together (i.e. with key "100__a,700__a", all 100__a ## and 700__a fields are matched together. Which is useful when first-author can vary for ## certain records on different systems). ## ## * COMPARISON THRESHOLD: ## a value between 0.0 and 1.0 specifying the threshold for string matches ## to determine if it is a match or not (using normalized string-distance). ## Normally 0.8 (80% match) is considered to be a close match. ## ## * COMPARISON MODE: ## the parse mode decides how the record datafields are compared: ## - 'strict' : all (sub-)fields are compared, and all must match. Order is significant. ## - 'normal' : all (sub-)fields are compared, and all must match. Order is ignored. ## - 'lazy' : all (sub-)fields are compared with each other and at least one must match ## - 'ignored': the tag is ignored in the match. Used to disable previously defined rules. ## ## * MATCHING MODE: ## the comparison mode decides how the fieldvalues are matched: ## - 'title' : uses a method specialized for comparing titles, e.g. looking for subtitles ## - 'author' : uses a special authorname comparison. Will take initials into account. ## - 'identifier' : special matching for identifiers, stripping away punctuation ## - 'date': matches dates by extracting and comparing the year ## - 'normal': normal string comparison. ## Note: Fields are considered matching when all its subfields or values match. ## ## * RESULT MODE: ## the result mode decides how the results from the comparisons are handled further: ## - 'normal' : a failed match will cause the validation to immediately exit as a failure. 
## a successful match will cause the validation to continue on other rules (if any) ## - 'final' : a failed match will cause the validation to immediately exit as a failure. ## a successful match will cause validation to immediately exit as a success. ## - 'joker' : a failed match will cause the validation to continue on other rules (if any). ## a successful match will cause validation to immediately exit as a success. ## ## You can add your own rulesets in the dictionary below. The 'default' ruleset is always applied, ## and should therefore NOT be removed, but can be changed. The tag-rules can also be overwritten ## by other rulesets. ## ## WARNING: Beware that the validation quality is only as good as given rules, so matching results ## are never guaranteed to be accurate, as it is very content-specific. CFG_BIBMATCH_MATCH_VALIDATION_RULESETS = [('default', [{ 'tags' : '245__%,242__%', 'threshold' : 0.8, 'compare_mode' : 'lazy', 'match_mode' : 'title', 'result_mode' : 'normal' }, { 'tags' : '037__a,088__a', 'threshold' : 1.0, 'compare_mode' : 'lazy', 'match_mode' : 'identifier', 'result_mode' : 'final' }, { 'tags' : '100__a,700__a', 'threshold' : 0.8, 'compare_mode' : 'normal', 'match_mode' : 'author', 'result_mode' : 'normal' }, { 'tags' : '773__a', 'threshold' : 1.0, 'compare_mode' : 'lazy', 'match_mode' : 'title', 'result_mode' : 'normal' }]), ('980__ \$\$a(THESIS|Thesis)', [{ 'tags' : '100__a', 'threshold' : 0.8, 'compare_mode' : 'strict', 'match_mode' : 'author', 'result_mode' : 'normal' }, { 'tags' : '700__a,701__a', 'threshold' : 1.0, 'compare_mode' : 'lazy', 'match_mode' : 'author', 'result_mode' : 'normal' }, { 'tags' : '100__a,700__a', 'threshold' : 0.8, 'compare_mode' : 'ignored', 'match_mode' : 'author', 'result_mode' : 'normal' }]), ('260__', [{ 'tags' : '260__c', 'threshold' : 0.8, 'compare_mode' : 'lazy', 'match_mode' : 'date', 'result_mode' : 'normal' }]), ('0247_', [{ 'tags' : '0247_a', 'threshold' : 1.0, 'compare_mode' : 'lazy', 'match_mode' : 
'identifier', 'result_mode' : 'final' }]), ('020__', [{ 'tags' : '020__a', 'threshold' : 1.0, 'compare_mode' : 'lazy', 'match_mode' : 'identifier', 'result_mode' : 'joker' }]) ] ## CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT -- Determines the minimum percentage of the ## amount of rules to be positively matched when comparing two records. Should the number ## of matches be lower than required matches but equal to or above this limit, ## the match will be considered fuzzy. CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT = 0.65 ## CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT -- Determines the maximum amount of search results ## a single search can return before acting as a non-match. CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT = 15 ###################################### ## Part 21: BibAuthorID parameters ## ###################################### # CFG_BIBAUTHORID_MAX_PROCESSES is the max number of processes # that may be spawned by the disambiguation algorithm CFG_BIBAUTHORID_MAX_PROCESSES = 12 # CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS is the max number of threads # to parallelize sql queries during personID tables updates CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS = 12 # CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY defines the user info # keys for externally claimed records in an remote-login scenario--e.g. from arXiv.org # e.g. "external_arxivids" for arXiv SSO CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY = # CFG_BIBAUTHORID_AID_ENABLED # Globally enable AuthorID Interfaces. # If False: No guest, user or operator will have access to the system. CFG_BIBAUTHORID_ENABLED = True # CFG_BIBAUTHORID_AID_ON_AUTHORPAGES # Enable AuthorID information on the author pages. CFG_BIBAUTHORID_ON_AUTHORPAGES = True # CFG_BIBAUTHORID_AUTHOR_TICKET_ADMIN_EMAIL defines the eMail address # all ticket requests concerning authors will be sent to. 
CFG_BIBAUTHORID_AUTHOR_TICKET_ADMIN_EMAIL = info@invenio-software.org #CFG_BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE defines if the optional arXiv stub page is skipped CFG_BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE = False ######################################### ## Part 22: BibCirculation parameters ## ######################################### ## CFG_BIBCIRCULATION_ITEM_STATUS_OPTIONAL -- comma-separated list of statuses # Example: missing, order delayed, not published # You can always add a new status here, but you may want to run some script # to update the database if you remove some statuses. CFG_BIBCIRCULATION_ITEM_STATUS_OPTIONAL = ## Here you can edit the text of the statuses that have specific roles. # You should run a script to update the database if you change them after having # used the module for some time. ## Item statuses # The book is on loan CFG_BIBCIRCULATION_ITEM_STATUS_ON_LOAN = on loan # Available for loan CFG_BIBCIRCULATION_ITEM_STATUS_ON_SHELF = on shelf # The book is being processed by the library (cataloguing, etc.) CFG_BIBCIRCULATION_ITEM_STATUS_IN_PROCESS = in process # The book has been ordered (bought) CFG_BIBCIRCULATION_ITEM_STATUS_ON_ORDER = on order # The order of the book has been cancelled CFG_BIBCIRCULATION_ITEM_STATUS_CANCELLED = cancelled # The order of the book has not arrived yet CFG_BIBCIRCULATION_ITEM_STATUS_NOT_ARRIVED = not arrived # The order of the book has not arrived yet and has been claimed CFG_BIBCIRCULATION_ITEM_STATUS_CLAIMED = claimed # The book has been proposed for acquisition and is under review. CFG_BIBCIRCULATION_ITEM_STATUS_UNDER_REVIEW = under review ## Loan statuses # This status should not be confused with CFG_BIBCIRCULATION_ITEM_STATUS_ON_LOAN. # If the item status is CFG_BIBCIRCULATION_ITEM_STATUS_ON_LOAN, then there is # a loan with status CFG_BIBCIRCULATION_LOAN_STATUS_ON_LOAN or # CFG_BIBCIRCULATION_LOAN_STATUS_EXPIRED.
# For each copy, there can only be one active loan ('on loan' or 'expired') at # a time, since there can be many 'returned' loans for the same copy. CFG_BIBCIRCULATION_LOAN_STATUS_ON_LOAN = on loan # The due date has come and the item has not been returned CFG_BIBCIRCULATION_LOAN_STATUS_EXPIRED = expired # The item has been returned. CFG_BIBCIRCULATION_LOAN_STATUS_RETURNED = returned ## Request statuses # There is at least one copy available, and this is the oldest request. CFG_BIBCIRCULATION_REQUEST_STATUS_PENDING = pending # There are no copies available, or there is another request with more priority. CFG_BIBCIRCULATION_REQUEST_STATUS_WAITING = waiting # The request has become a loan CFG_BIBCIRCULATION_REQUEST_STATUS_DONE = done # The request has been cancelled CFG_BIBCIRCULATION_REQUEST_STATUS_CANCELLED = cancelled # The request has been generated for a proposed book CFG_BIBCIRCULATION_REQUEST_STATUS_PROPOSED = proposed # ILL request statuses CFG_BIBCIRCULATION_ILL_STATUS_NEW = new CFG_BIBCIRCULATION_ILL_STATUS_REQUESTED = requested CFG_BIBCIRCULATION_ILL_STATUS_ON_LOAN = on loan CFG_BIBCIRCULATION_ILL_STATUS_RETURNED = returned CFG_BIBCIRCULATION_ILL_STATUS_CANCELLED = cancelled CFG_BIBCIRCULATION_ILL_STATUS_RECEIVED = received #Book proposal statuses CFG_BIBCIRCULATION_PROPOSAL_STATUS_NEW = proposal-new CFG_BIBCIRCULATION_PROPOSAL_STATUS_ON_ORDER = proposal-on order CFG_BIBCIRCULATION_PROPOSAL_STATUS_PUT_ASIDE = proposal-put aside CFG_BIBCIRCULATION_PROPOSAL_STATUS_RECEIVED = proposal-received # Purchase statuses CFG_BIBCIRCULATION_ACQ_STATUS_NEW = new CFG_BIBCIRCULATION_ACQ_STATUS_ON_ORDER = on order CFG_BIBCIRCULATION_ACQ_STATUS_PARTIAL_RECEIPT = partial receipt CFG_BIBCIRCULATION_ACQ_STATUS_RECEIVED = received CFG_BIBCIRCULATION_ACQ_STATUS_CANCELLED = cancelled ## Library types # Normal library where you have your books. It can also be a depot. CFG_BIBCIRCULATION_LIBRARY_TYPE_INTERNAL = internal # external libraries for ILL.
CFG_BIBCIRCULATION_LIBRARY_TYPE_EXTERNAL = external # The main library is also an internal library. # Since you may have several depots or small sites you can tag one of them as # the main site. CFG_BIBCIRCULATION_LIBRARY_TYPE_MAIN = main # It is also an internal library. The copies in this type of library will NOT # be displayed to borrowers. Use this for depots. CFG_BIBCIRCULATION_LIBRARY_TYPE_HIDDEN = hidden ## Amazon access key. You will need your own key. # Example: 1T6P5M3ZDMW9AWJ212R2 CFG_BIBCIRCULATION_AMAZON_ACCESS_KEY = ###################################### ## Part 22: BibClassify parameters ## ###################################### # CFG_BIBCLASSIFY_WEB_MAXKW -- maximum number of keywords to display # in the Keywords tab web page. CFG_BIBCLASSIFY_WEB_MAXKW = 100 ######################################## ## Part 23: Plotextractor parameters ## ######################################## ## CFG_PLOTEXTRACTOR_SOURCE_BASE_URL -- for acquiring source tarballs for plot ## extraction, where should we look? If nothing is set, we'll just go ## to arXiv, but this can be a filesystem location, too CFG_PLOTEXTRACTOR_SOURCE_BASE_URL = http://arxiv.org/ ## CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER -- for acquiring source tarballs for plot ## extraction, subfolder where the tarballs sit CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER = e-print/ ## CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER -- for acquiring source tarballs for plot ## extraction, subfolder where the pdf sit CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER = pdf/ ## CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT -- a float representing the number of seconds ## to wait between each download of pdf and/or tarball from source URL. CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT = 2.0 ## CFG_PLOTEXTRACTOR_CONTEXT_LIMIT -- when extracting context of plots from ## TeX sources, this is the limitation of characters in each direction to extract ## context from. Default 750. 
CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT = 750 ## CFG_PLOTEXTRACTOR_DISALLOWED_TEX -- when extracting context of plots from TeX ## sources, this is the list of TeX tags that will trigger 'end of context'. CFG_PLOTEXTRACTOR_DISALLOWED_TEX = begin,end,section,includegraphics,caption,acknowledgements ## CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT -- when extracting context of plots from ## TeX sources, this is the limitation of words in each direction. Default 75. CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT = 75 ## CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT -- when extracting context of plots from ## TeX sources, this is the limitation of sentences in each direction. Default 2. CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT = 2 ###################################### ## Part 24: WebStat parameters ## ###################################### # CFG_WEBSTAT_BIBCIRCULATION_START_YEAR defines the start date of the BibCirculation # statistics. Value should have the format 'yyyy'. If empty, take all existing data. CFG_WEBSTAT_BIBCIRCULATION_START_YEAR = ###################################### ## Part 25: Web API Key parameters ## ###################################### # CFG_WEB_API_KEY_ALLOWED_URL defines the web apps that are going to use the web # API key. It has three values, the name of the web app, the time of life for the # secure url and if a time stamp is needed. 
#CFG_WEB_API_KEY_ALLOWED_URL = [('search/\?', 3600, True), # ('rss', 0, False)] CFG_WEB_API_KEY_ALLOWED_URL = [] ########################################## ## Part 26: WebAuthorProfile parameters ## ########################################## #CFG_WEBAUTHORPROFILE_CACHE_EXPIRED_DELAY_LIVE consider a cached element expired after days #when loading an authorpage, thus recomputing the content live CFG_WEBAUTHORPROFILE_CACHE_EXPIRED_DELAY_LIVE = 7 #CFG_WEBAUTHORPROFILE_CACHE_EXPIRED_DELAY_BIBSCHED consider a cache element expired after days, #thus recompute it, bibsched daemon CFG_WEBAUTHORPROFILE_CACHE_EXPIRED_DELAY_BIBSCHED = 5 #CFG_WEBAUTHORPROFILE_MAX_COLLAB_LIST: limit collaboration list. #Set to 0 to disable limit. CFG_WEBAUTHORPROFILE_MAX_COLLAB_LIST = 100 #CFG_WEBAUTHORPROFILE_MAX_KEYWORD_LIST: limit keywords list #Set to 0 to disable limit. CFG_WEBAUTHORPROFILE_MAX_KEYWORD_LIST = 100 #CFG_WEBAUTHORPROFILE_MAX_AFF_LIST: limit affiliations list #Set to 0 to disable limit. CFG_WEBAUTHORPROFILE_MAX_AFF_LIST = 100 #CFG_WEBAUTHORPROFILE_MAX_COAUTHOR_LIST: limit coauthors list #Set to 0 to disable limit. CFG_WEBAUTHORPROFILE_MAX_COAUTHOR_LIST = 100 #CFG_WEBAUTHORPROFILE_MAX_HEP_CHOICES: limit HepRecords choices #Set to 0 to disable limit. CFG_WEBAUTHORPROFILE_MAX_HEP_CHOICES = 10 #CFG_WEBAUTHORPROFILE_USE_BIBAUTHORID: use bibauthorid or exactauthor CFG_WEBAUTHORPROFILE_USE_BIBAUTHORID = False #################################### ## Part 27: BibSort parameters ## #################################### ## CFG_BIBSORT_BUCKETS -- the number of buckets bibsort should use. ## If 0, then no buckets will be used (bibsort will be inactive). ## If different from 0, bibsort will be used for sorting the records. ## The number of buckets should be set with regards to the size ## of the repository; having a larger number of buckets will increase ## the sorting performance for the top results but will decrease ## the performance for sorting the middle results. 
## We recommend to use 1 in case you have less than about ## 1,000,000 records. ## When modifying this variable, re-run rebalancing for all the bibsort ## methods, for having the database in synch. CFG_BIBSORT_BUCKETS = 1 ######################################## ## Part 28: JsTestDriver parameters ## ######################################## ## CFG_JSTESTDRIVER_PORT -- server port where JS tests will be run. CFG_JSTESTDRIVER_PORT = 9876 ############################ ## Part 29: RefExtract ## ############################ ## Refextract can automatically submit tickets (after extracting references) ## to CFG_REFEXTRACT_TICKET_QUEUE if it is set CFG_REFEXTRACT_TICKET_QUEUE = None ## Override refextract kbs locations CFG_REFEXTRACT_KBS_OVERRIDE = {} ################################## ## Part 27: CrossRef parameters ## ################################## ## CFG_CROSSREF_USERNAME -- the username used when sending request ## to the Crossref site. CFG_CROSSREF_USERNAME = ## CFG_CROSSREF_PASSWORD -- the password used when sending request ## to the Crossref site. CFG_CROSSREF_PASSWORD = ##################################### ## Part 30: WebLinkback parameters ## ##################################### ## CFG_WEBLINKBACK_TRACKBACK_ENABLED -- whether to enable trackback support ## 1 to enable, 0 to disable it CFG_WEBLINKBACK_TRACKBACK_ENABLED = 0 #################################### ## Part 31: WebSubmit parameters ## #################################### ## CFG_WEBSUBMIT_USE_MATHJAX -- whether to use MathJax and math ## preview panel within submissions (1) or not (0). Customize your ## websubmit_template.tmpl_mathpreview_header() to enable for specific ## fields. ## See also CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS CFG_WEBSUBMIT_USE_MATHJAX = 0 #################################### ## Part 31: BibField parameters ## #################################### ## CFG_BIBFIELD_MASTER_FORMATS -- the name of all the allowed master formats ## that BibField will work with. 
CFG_BIBFIELD_MASTER_FORMATS = marc ########################## ## THAT's ALL, FOLKS! ## ########################## diff --git a/modules/bibdocfile/lib/bibdocfile.py b/modules/bibdocfile/lib/bibdocfile.py index ee008e26f..d02be56cc 100644 --- a/modules/bibdocfile/lib/bibdocfile.py +++ b/modules/bibdocfile/lib/bibdocfile.py @@ -1,4812 +1,4818 @@ ## This file is part of Invenio. ## Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ This module implements the low-level API for dealing with fulltext files. - All the files associated to a I{record} (identified by a I{recid}) can be managed via an instance of the C{BibRecDocs} class. - A C{BibRecDocs} is a wrapper of the list of I{documents} attached to the record. - Each document is represented by an instance of the C{BibDoc} class. - A document is identified by a C{docid} and name (C{docname}). The docname must be unique within the record. A document is the set of all the formats and revisions of a piece of information. - A document has a type called C{doctype} and can have a restriction. - Each physical file, i.e. the concretization of a document into a particular I{version} and I{format} is represented by an instance of the C{BibDocFile} class. - The format is infact the extension of the physical file. 
- A comment and a description and other information can be associated to a BibDocFile. - A C{bibdoc} is a synonym for a document, while a C{bibdocfile} is a synonym for a physical file. @group Main classes: BibRecDocs,BibDoc,BibDocFile @group Other classes: BibDocMoreInfo,Md5Folder,InvenioBibDocFileError @group Main functions: decompose_file,stream_file,bibdocfile_*,download_url @group Configuration Variables: CFG_* """ __revision__ = "$Id$" import os import re import shutil import filecmp import time import random import socket import urllib2 import urllib import tempfile import cPickle import base64 import binascii import cgi import sys if sys.hexversion < 0x2060000: from md5 import md5 else: from hashlib import md5 # pylint: disable=E0611 try: import magic if hasattr(magic, "open"): CFG_HAS_MAGIC = 1 elif hasattr(magic, "Magic"): CFG_HAS_MAGIC = 2 except ImportError: CFG_HAS_MAGIC = 0 from datetime import datetime from mimetypes import MimeTypes from thread import get_ident from invenio import webinterface_handler_config as apache ## Let's set a reasonable timeout for URL request (e.g. 
FFT) socket.setdefaulttimeout(40) if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 from invenio.shellutils import escape_shell_arg from invenio.dbquery import run_sql, DatabaseError from invenio.errorlib import register_exception from invenio.bibrecord import record_get_field_instances, \ field_get_subfield_values, field_get_subfield_instances, \ encode_for_xml from invenio.urlutils import create_url, make_user_agent_string from invenio.textutils import nice_size from invenio.access_control_engine import acc_authorize_action from invenio.access_control_admin import acc_is_user_in_role, acc_get_role_id from invenio.access_control_firerole import compile_role_definition, acc_firerole_check_user from invenio.access_control_config import SUPERADMINROLE, CFG_WEBACCESS_WARNING_MSGS from invenio.config import CFG_SITE_URL, \ CFG_WEBDIR, CFG_BIBDOCFILE_FILEDIR,\ CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS, \ CFG_BIBDOCFILE_FILESYSTEM_BIBDOC_GROUP_LIMIT, CFG_SITE_SECURE_URL, \ CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS, \ CFG_TMPDIR, CFG_TMPSHAREDDIR, CFG_PATH_MD5SUM, \ CFG_WEBSUBMIT_STORAGEDIR, \ CFG_BIBDOCFILE_USE_XSENDFILE, \ CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY, \ CFG_SITE_RECORD, CFG_PYLIBDIR, \ CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS, \ CFG_BIBDOCFILE_ENABLE_BIBDOCFSINFO_CACHE, \ - CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES + CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES, \ + CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES from invenio.bibdocfile_config import CFG_BIBDOCFILE_ICON_SUBFORMAT_RE, \ CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT from invenio.pluginutils import PluginContainer import invenio.template bibdocfile_templates = invenio.template.load('bibdocfile') ## The above flag controls whether HTTP range requests are supported or not ## when serving static files via Python. This is disabled by default as ## it currently breaks support for opening PDF files on Windows platforms ## using Acrobat reader brower plugin. 
CFG_ENABLE_HTTP_RANGE_REQUESTS = False #: block size when performing I/O. CFG_BIBDOCFILE_BLOCK_SIZE = 1024 * 8 #: threshold used to decide when to use Python MD5 or CLI MD5 algorithm. CFG_BIBDOCFILE_MD5_THRESHOLD = 256 * 1024 #: chunks loaded by the Python MD5 algorithm. CFG_BIBDOCFILE_MD5_BUFFER = 1024 * 1024 #: whether to normalize e.g. ".JPEG" and ".jpg" into .jpeg. CFG_BIBDOCFILE_STRONG_FORMAT_NORMALIZATION = False #: flags that can be associated to files. CFG_BIBDOCFILE_AVAILABLE_FLAGS = ( 'PDF/A', 'STAMPED', 'PDFOPT', 'HIDDEN', 'CONVERTED', 'PERFORM_HIDE_PREVIOUS', 'OCRED' ) DBG_LOG_QUERIES = False #: constant used if FFT correct with the obvious meaning. KEEP_OLD_VALUE = 'KEEP-OLD-VALUE' _CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS = [(re.compile(_regex), _headers) for _regex, _headers in CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS] _mimes = MimeTypes(strict=False) _mimes.suffix_map.update({'.tbz2' : '.tar.bz2'}) _mimes.encodings_map.update({'.bz2' : 'bzip2'}) +if CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES: + for key, value in CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES.iteritems(): + _mimes.add_type(key, value) + del key, value + _magic_cookies = {} if CFG_HAS_MAGIC == 1: def _get_magic_cookies(): """ @return: a tuple of magic object. @rtype: (MAGIC_NONE, MAGIC_COMPRESS, MAGIC_MIME, MAGIC_COMPRESS + MAGIC_MIME) @note: ... not real magic. 
Just see: man file(1) """ thread_id = get_ident() if thread_id not in _magic_cookies: _magic_cookies[thread_id] = { magic.MAGIC_NONE: magic.open(magic.MAGIC_NONE), magic.MAGIC_COMPRESS: magic.open(magic.MAGIC_COMPRESS), magic.MAGIC_MIME: magic.open(magic.MAGIC_MIME), magic.MAGIC_COMPRESS + magic.MAGIC_MIME: magic.open(magic.MAGIC_COMPRESS + magic.MAGIC_MIME), magic.MAGIC_MIME_TYPE: magic.open(magic.MAGIC_MIME_TYPE), } for key in _magic_cookies[thread_id].keys(): _magic_cookies[thread_id][key].load() return _magic_cookies[thread_id] elif CFG_HAS_MAGIC == 2: def _magic_wrapper(local_path, mime=True, mime_encoding=False): thread_id = get_ident() if (thread_id, mime, mime_encoding) not in _magic_cookies: magic_object = _magic_cookies[thread_id, mime, mime_encoding] = magic.Magic(mime=mime, mime_encoding=mime_encoding) else: magic_object = _magic_cookies[thread_id, mime, mime_encoding] return magic_object.from_file(local_path) # pylint: disable=E1103 def _generate_extensions(): """ Generate the regular expression to match all the known extensions. @return: the regular expression. @rtype: regular expression object """ _tmp_extensions = _mimes.encodings_map.keys() + \ _mimes.suffix_map.keys() + \ _mimes.types_map[1].keys() + \ CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS extensions = [] for ext in _tmp_extensions: if ext.startswith('.'): extensions.append(ext) else: extensions.append('.' + ext) extensions.sort() extensions.reverse() extensions = set([ext.lower() for ext in extensions]) extensions = '\\' + '$|\\'.join(extensions) + '$' extensions = extensions.replace('+', '\\+') return re.compile(extensions, re.I) #: Regular expression to recognized extensions. _extensions = _generate_extensions() class InvenioBibDocFileError(Exception): """ Exception raised in case of errors related to fulltext files. """ pass class InvenioBibdocfileUnauthorizedURL(InvenioBibDocFileError): """ Exception raised in case of errors related to fulltext files. 
""" ## NOTE: this is a legacy Exception pass def _val_or_null(val, eq_name = None, q_str = None, q_args = None): """ Auxiliary function helpful while building WHERE clauses of SQL queries that should contain field=val or field is val If optional parameters q_str and q_args are provided, lists are updated if val == None, a statement of the form "eq_name is Null" is returned otherwise, otherwise the function returns a parametrised comparison "eq_name=%s" with val as an argument added to the query args list. Using parametrised queries diminishes the likelihood of having SQL injection. @param val Value to compare with @type val @param eq_name The name of the database column @type eq_name string @param q_str Query string builder - list of clauses that should be connected by AND operator @type q_str list @param q_args Query arguments list. This list will be applied as a second argument of run_sql command @type q_args list @result string of a single part of WHERE clause @rtype string """ res = "" if eq_name != None: res += eq_name if val == None: if eq_name != None: res += " is " res += "NULL" if q_str != None: q_str.append(res) return res else: if eq_name != None: res += "=" res += "%s" if q_str != None: q_str.append(res) if q_args != None: q_args.append(str(val)) return res def _sql_generate_conjunctive_where(to_process): """Generating WHERE clause of a SQL statement, consisting of conjunction of declared terms. Terms are defined by the to_process argument. the method creates appropriate entries different in the case, value should be NULL (None in the list) and in the case of not-none arguments. In the second case, parametrised query is generated decreasing the chance of an SQL-injection. 
@param to_process List of tuples (value, database_column) @type to_process list""" q_str = [] q_args = [] for entry in to_process: q_str.append(_val_or_null(entry[0], eq_name = entry[1], q_args = q_args)) return (" AND ".join(q_str), q_args) def file_strip_ext(afile, skip_version=False, only_known_extensions=False, allow_subformat=True): """ Strip in the best way the extension from a filename. >>> file_strip_ext("foo.tar.gz") 'foo' >>> file_strip_ext("foo.buz.gz") 'foo.buz' >>> file_strip_ext("foo.buz") 'foo' >>> file_strip_ext("foo.buz", only_known_extensions=True) 'foo.buz' >>> file_strip_ext("foo.buz;1", skip_version=False, ... only_known_extensions=True) 'foo.buz;1' >>> file_strip_ext("foo.gif;icon") 'foo' >>> file_strip_ext("foo.gif:icon", allow_subformat=False) 'foo.gif:icon' @param afile: the path/name of a file. @type afile: string @param skip_version: whether to skip a trailing ";version". @type skip_version: bool @param only_known_extensions: whether to strip out only known extensions or to consider as extension anything that follows a dot. @type only_known_extensions: bool @param allow_subformat: whether to consider also subformats as part of the extension. @type allow_subformat: bool @return: the name/path without the extension (and version). @rtype: string """ if skip_version or allow_subformat: afile = afile.split(';')[0] nextfile = _extensions.sub('', afile) if nextfile == afile and not only_known_extensions: nextfile = os.path.splitext(afile)[0] while nextfile != afile: afile = nextfile nextfile = _extensions.sub('', afile) return nextfile def normalize_format(docformat, allow_subformat=True): """ Normalize the format, e.g. by adding a dot in front. @param format: the format/extension to be normalized. @type format: string @param allow_subformat: whether to consider also subformats as part of the extension. @type allow_subformat: bool @return: the normalized format. 
@rtype: string """ if allow_subformat: subformat = docformat[docformat.rfind(';'):] docformat = docformat[:docformat.rfind(';')] else: subformat = '' if docformat and docformat[0] != '.': docformat = '.' + docformat if CFG_BIBDOCFILE_STRONG_FORMAT_NORMALIZATION: if docformat not in ('.Z', '.H', '.C', '.CC'): docformat = docformat.lower() docformat = { '.jpg' : '.jpeg', '.htm' : '.html', '.tif' : '.tiff' }.get(docformat, docformat) return docformat + subformat def guess_format_from_url(url): """ Given a URL tries to guess its extension. Different method will be used, including HTTP HEAD query, downloading the resource and using mime @param url: the URL for which the extension should be guessed. @type url: string @return: the recognized extension or '.bin' if it's impossible to recognize it. @rtype: string """ def guess_via_magic(local_path): try: if CFG_HAS_MAGIC == 1: magic_cookie = _get_magic_cookies()[magic.MAGIC_MIME_TYPE] mimetype = magic_cookie.file(local_path) elif CFG_HAS_MAGIC == 2: mimetype = _magic_wrapper(local_path, mime=True, mime_encoding=False) if CFG_HAS_MAGIC: ext = _mimes.guess_extension(mimetype) if ext: ## Normalize some common magic mis-interpretation ext = {'.asc': '.txt', '.obj': '.bin'}.get(ext, ext) return normalize_format(ext) except Exception: pass ## Let's try to guess the extension by considering the URL as a filename ext = decompose_file(url, skip_version=True, only_known_extensions=True)[2] if ext.startswith('.'): return ext if is_url_a_local_file(url): ## The URL corresponds to a local file, so we can safely consider ## traditional extensions after the dot. ext = decompose_file(url, skip_version=True, only_known_extensions=False)[2] if ext.startswith('.'): return ext ## No extensions? Let's use Magic. 
ext = guess_via_magic(url) if ext: return ext else: ## Since the URL is remote, let's try to perform a HEAD request ## and see the corresponding headers try: response = open_url(url, head_request=True) except (InvenioBibdocfileUnauthorizedURL, urllib2.URLError): return ".bin" ext = get_format_from_http_response(response) if ext: return ext if CFG_HAS_MAGIC: ## Last solution: let's download the remote resource ## and use the Python magic library to guess the extension filename = "" try: try: filename = download_url(url, docformat='') ext = guess_via_magic(filename) if ext: return ext except Exception: pass finally: if os.path.exists(filename): ## Let's free space os.remove(filename) return ".bin" _docname_re = re.compile(r'[^-\w.]*') def normalize_docname(docname): """ Normalize the docname. At the moment the normalization is just returning the same string. @param docname: the docname to be normalized. @type docname: string @return: the normalized docname. @rtype: string """ #return _docname_re.sub('', docname) return docname def normalize_version(version): """ Normalize the version. The version can be either an integer or the keyword 'all'. Any other value will be transformed into the empty string. @param version: the version (either a number or 'all'). @type version: integer or string @return: the normalized version. @rtype: string """ try: int(version) except ValueError: if version.lower().strip() == 'all': return 'all' else: return '' return str(version) def compose_file(dirname, extension, subformat=None, version=None, storagename=None): """ Construct back a fullpath given the separate components. 
@param @param storagename Name under which the file should be stored in the filesystem @type storagename string @return a fullpath to the file @rtype string """ if version: version = ";%i" % int(version) else: version = "" if subformat: if not subformat.startswith(";"): subformat = ";%s" % subformat else: subformat = "" if extension and not extension.startswith("."): extension = ".%s" % extension if not storagename: storagename = "content" return os.path.join(dirname, storagename + extension + subformat + version) def compose_format(extension, subformat=None): """ Construct the format string """ if not extension.startswith("."): extension = ".%s" % extension if subformat: if not subformat.startswith(";"): subformat = ";%s" % subformat else: subformat = "" return extension + subformat def decompose_file(afile, skip_version=False, only_known_extensions=False, allow_subformat=True): """ Decompose a file/path into its components dirname, basename and extension. >>> decompose_file('/tmp/foo.tar.gz') ('/tmp', 'foo', '.tar.gz') >>> decompose_file('/tmp/foo.tar.gz;1', skip_version=True) ('/tmp', 'foo', '.tar.gz') >>> decompose_file('http://www.google.com/index.html') ('http://www.google.com', 'index', '.html') @param afile: the path/name of a file. @type afile: string @param skip_version: whether to skip a trailing ";version". @type skip_version: bool @param only_known_extensions: whether to strip out only known extensions or to consider as extension anything that follows a dot. @type only_known_extensions: bool @param allow_subformat: whether to consider also subformats as part of the extension. @type allow_subformat: bool @return: a tuple with the directory name, the basename and extension. @rtype: (dirname, basename, extension) @note: if a URL is provided, the scheme will be part of the dirname. @see: L{file_strip_ext} for the algorithm used to retrieve the extension. 
""" if skip_version: version = afile.split(';')[-1] try: int(version) afile = afile[:-len(version)-1] except ValueError: pass basename = os.path.basename(afile) dirname = afile[:-len(basename)-1] base = file_strip_ext( basename, only_known_extensions=only_known_extensions, allow_subformat=allow_subformat) extension = basename[len(base) + 1:] if extension: extension = '.' + extension return (dirname, base, extension) def decompose_file_with_version(afile): """ Decompose a file into dirname, basename, extension and version. >>> decompose_file_with_version('/tmp/foo.tar.gz;1') ('/tmp', 'foo', '.tar.gz', 1) @param afile: the path/name of a file. @type afile: string @return: a tuple with the directory name, the basename, extension and version. @rtype: (dirname, basename, extension, version) @raise ValueError: in case version does not exist it will. @note: if a URL is provided, the scheme will be part of the dirname. """ version_str = afile.split(';')[-1] version = int(version_str) afile = afile[:-len(version_str)-1] basename = os.path.basename(afile) dirname = afile[:-len(basename)-1] base = file_strip_ext(basename) extension = basename[len(base) + 1:] if extension: extension = '.' + extension return (dirname, base, extension, version) def get_subformat_from_format(docformat): """ @return the subformat if any. @rtype: string >>> get_subformat_from_format('foo;bar') 'bar' >>> get_subformat_from_format('foo') '' """ try: return docformat[docformat.rindex(';') + 1:] except ValueError: return '' def get_superformat_from_format(docformat): """ @return the superformat if any. @rtype: string >>> get_superformat_from_format('foo;bar') 'foo' >>> get_superformat_from_format('foo') 'foo' """ try: return docformat[:docformat.rindex(';')] except ValueError: return docformat def propose_next_docname(docname): """ Given a I{docname}, suggest a new I{docname} (useful when trying to generate a unique I{docname}). 
>>> propose_next_docname('foo') 'foo_1' >>> propose_next_docname('foo_1') 'foo_2' >>> propose_next_docname('foo_10') 'foo_11' @param docname: the base docname. @type docname: string @return: the next possible docname based on the given one. @rtype: string """ if '_' in docname: split_docname = docname.split('_') try: split_docname[-1] = str(int(split_docname[-1]) + 1) docname = '_'.join(split_docname) except ValueError: docname += '_1' else: docname += '_1' return docname class BibRecDocs(object): """ This class represents all the files attached to one record. @param recid: the record identifier. @type recid: integer @param deleted_too: whether to consider deleted documents as normal documents (useful when trying to recover deleted information). @type deleted_too: bool @param human_readable: whether numbers should be printed in human readable format (e.g. 2048 bytes -> 2Kb) @ivar id: the record identifier as passed to the constructor. @type id: integer @ivar human_readable: the human_readable flag as passed to the constructor. @type human_readable: bool @ivar deleted_too: the deleted_too flag as passed to the constructor. @type deleted_too: bool @ivar bibdocs: the list of documents attached to the record. @type bibdocs: list of BibDoc """ def __init__(self, recid, deleted_too=False, human_readable=False): try: self.id = int(recid) except ValueError: raise ValueError("BibRecDocs: recid is %s but must be an integer." % repr(recid)) self.human_readable = human_readable self.deleted_too = deleted_too self.bibdocs = {} self.attachment_types = {} # dictionary docname->attachment type self.build_bibdoc_list() def __repr__(self): """ @return: the canonical string representation of the C{BibRecDocs}. @rtype: string """ return 'BibRecDocs(%s%s%s)' % (self.id, self.deleted_too and ', True' or '', self.human_readable and ', True' or '' ) def __str__(self): """ @return: an easy to be I{grepped} string representation of the whole C{BibRecDocs} content. 
@rtype: string """ out = '%i::::total bibdocs attached=%i\n' % (self.id, len(self.bibdocs)) out += '%i::::total size latest version=%s\n' % (self.id, nice_size(self.get_total_size_latest_version())) out += '%i::::total size all files=%s\n' % (self.id, nice_size(self.get_total_size())) for (docname, (bibdoc, dummy)) in self.bibdocs.items(): out += str(docname) + ":" + str(bibdoc) return out def empty_p(self): """ @return: True when the record has no attached documents. @rtype: bool """ return len(self.bibdocs) == 0 def deleted_p(self): """ @return: True if the correxsponding record has been deleted. @rtype: bool """ from invenio.search_engine import record_exists return record_exists(self.id) == -1 def get_xml_8564(self): """ Return a snippet of I{MARCXML} representing the I{8564} fields corresponding to the current state. @return: the MARCXML representation. @rtype: string """ from invenio.search_engine import get_record out = '' record = get_record(self.id) fields = record_get_field_instances(record, '856', '4', ' ') for field in fields: urls = field_get_subfield_values(field, 'u') if urls and not bibdocfile_url_p(urls[0]): out += '\t\n' for subfield, value in field_get_subfield_instances(field): out += '\t\t%s\n' % (subfield, encode_for_xml(value)) out += '\t\n' for afile in self.list_latest_files(list_hidden=False): out += '\t\n' url = afile.get_url() description = afile.get_description() comment = afile.get_comment() if url: out += '\t\t%s\n' % encode_for_xml(url) if description: out += '\t\t%s\n' % encode_for_xml(description) if comment: out += '\t\t%s\n' % encode_for_xml(comment) out += '\t\n' return out def get_total_size_latest_version(self): """ Returns the total size used on disk by all the files belonging to this record and corresponding to the latest version. @return: the total size. 
@rtype: integer """ size = 0 for (bibdoc, _) in self.bibdocs.values(): size += bibdoc.get_total_size_latest_version() return size def get_total_size(self): """ Return the total size used on disk of all the files belonging to this record of any version (not only the last as in L{get_total_size_latest_version}). @return: the total size. @rtype: integer """ size = 0 for (bibdoc, _) in self.bibdocs.values(): size += bibdoc.get_total_size() return size def build_bibdoc_list(self): """ This method must be called every time a I{bibdoc} is added, removed or modified. """ self.bibdocs = {} if self.deleted_too: res = run_sql("""SELECT brbd.id_bibdoc, brbd.docname, brbd.type FROM bibrec_bibdoc as brbd JOIN bibdoc as bd ON bd.id=brbd.id_bibdoc WHERE brbd.id_bibrec=%s ORDER BY brbd.docname ASC""", (self.id,)) else: res = run_sql("""SELECT brbd.id_bibdoc, brbd.docname, brbd.type FROM bibrec_bibdoc as brbd JOIN bibdoc as bd ON bd.id=brbd.id_bibdoc WHERE brbd.id_bibrec=%s AND bd.status<>'DELETED' ORDER BY brbd.docname ASC""", (self.id,)) for row in res: cur_doc = BibDoc.create_instance(docid=row[0], recid=self.id, human_readable=self.human_readable) self.bibdocs[row[1]] = (cur_doc, row[2]) def list_bibdocs_by_names(self, doctype=None): """ Returns the dictionary of all bibdocs objects belonging to a recid. Keys in the dictionary are names of documents and values are BibDoc objects. If C{doctype} is set, it returns just the bibdocs of that doctype. @param doctype: the optional doctype. @type doctype: string @return: the dictionary of bibdocs. @rtype: dictionary of Docname -> BibDoc """ if not doctype: return dict((k,v) for (k,(v,_)) in self.bibdocs.iteritems()) res = {} for (docname, (doc, attachmenttype)) in self.bibdocs.iteritems(): if attachmenttype == doctype: res[docname] = doc return res def list_bibdocs(self, doctype=None): """ Returns the list of all bibdocs objects belonging to a recid. If C{doctype} is set, it returns just the bibdocs of that doctype. 
@param doctype: the optional doctype. @type doctype: string @return: the list of bibdocs. @rtype: list of BibDoc """ if not doctype: return [d for (d,_) in self.bibdocs.values()] else: return [bibdoc for (bibdoc, attype) in self.bibdocs.values() if doctype == attype] def get_bibdoc_names(self, doctype=None): """ Returns all the names of the documents associated with the bibrec. If C{doctype} is set, restrict the result to all the matching doctype. @param doctype: the optional doctype. @type doctype: string @return: the list of document names. @rtype: list of string """ return [docname for (docname, dummy) in self.list_bibdocs_by_names(doctype).items()] def check_file_exists(self, path, f_format): """ Check if a file with the same content of the file pointed in C{path} is already attached to this record. @param path: the file to be checked against. @type path: string @return: True if a file with the requested content is already attached to the record. @rtype: bool """ size = os.path.getsize(path) # Let's consider all the latest files files = self.list_latest_files() # Let's consider all the latest files with same size potential = [afile for afile in files if afile.get_size() == size and afile.format == f_format] if potential: checksum = calculate_md5(path) # Let's consider all the latest files with the same size and the # same checksum potential = [afile for afile in potential if afile.get_checksum() == checksum] if potential: potential = [afile for afile in potential if \ filecmp.cmp(afile.get_full_path(), path)] if potential: return True else: # Gosh! How unlucky, same size, same checksum but not same # content! pass return False def propose_unique_docname(self, docname): """ Given C{docname}, return a new docname that is not already attached to the record. @param docname: the reference docname. @type docname: string @return: a docname not already attached. 
@rtype: string """ docname = normalize_docname(docname) goodname = docname i = 1 while goodname in self.get_bibdoc_names(): i += 1 goodname = "%s_%s" % (docname, i) return goodname def merge_bibdocs(self, docname1, docname2): """ This method merge C{docname2} into C{docname1}. 1. Given all the formats of the latest version of the files attached to C{docname2}, these files are added as new formats into C{docname1}. 2. C{docname2} is marked as deleted. @raise InvenioBibDocFileError: if at least one format in C{docname2} already exists in C{docname1}. (In this case the two bibdocs are preserved) @note: comments and descriptions are also copied. @note: if C{docname2} has a I{restriction}(i.e. if the I{status} is set) and C{docname1} doesn't, the restriction is imported. """ bibdoc1 = self.get_bibdoc(docname1) bibdoc2 = self.get_bibdoc(docname2) ## Check for possibility for bibdocfile in bibdoc2.list_latest_files(): docformat = bibdocfile.get_format() if bibdoc1.format_already_exists_p(docformat): raise InvenioBibDocFileError('Format %s already exists in bibdoc %s of record %s. It\'s impossible to merge bibdoc %s into it.' % (docformat, docname1, self.id, docname2)) ## Importing restriction if needed. restriction1 = bibdoc1.get_status() restriction2 = bibdoc2.get_status() if restriction2 and not restriction1: bibdoc1.set_status(restriction2) ## Importing formats for bibdocfile in bibdoc2.list_latest_files(): docformat = bibdocfile.get_format() comment = bibdocfile.get_comment() description = bibdocfile.get_description() bibdoc1.add_file_new_format(bibdocfile.get_full_path(), description=description, comment=comment, docformat=docformat) ## Finally deleting old bibdoc2 bibdoc2.delete() self.build_bibdoc_list() def get_docid(self, docname): """ @param docname: the document name. @type docname: string @return: the identifier corresponding to the given C{docname}. 
        @rtype: integer
        @raise InvenioBibDocFileError: if the C{docname} does not
            correspond to a document attached to this record.
        """
        if docname in self.bibdocs:
            # bibdocs maps docname -> (BibDoc, attachment_type)
            return self.bibdocs[docname][0].id
        raise InvenioBibDocFileError, "Recid '%s' is not connected with a " \
            "docname '%s'" % (self.id, docname)

    def get_docname(self, docid):
        """
        @param docid: the document identifier.
        @type docid: integer
        @return: the name of the document corresponding to the given document
            identifier.
        @rtype: string
        @raise InvenioBibDocFileError: if the C{docid} does not
            correspond to a document attached to this record.
        """
        # Linear scan over the attached documents; the mapping is small.
        for (docname, (bibdoc, _)) in self.bibdocs.items():
            if bibdoc.id == docid:
                return docname
        raise InvenioBibDocFileError, "Recid '%s' is not connected with a " \
            "docid '%s'" % (self.id, docid)

    def change_name(self, newname, oldname=None, docid=None):
        """
        Renames document of a given name.

        @param newname: the new name.
        @type newname: string
        @raise InvenioBibDocFileError: if the new name corresponds to
            a document already attached to the record owning this document.
""" if not oldname and not docid: raise StandardError("Trying to rename unspecified document") if not oldname: oldname = self.get_docname(docid) if not docid: docid = self.get_docid(oldname) doc, atttype = self.bibdocs[oldname] try: newname = normalize_docname(newname) res = run_sql("SELECT id_bibdoc FROM bibrec_bibdoc WHERE id_bibrec=%s AND docname=%s", (self.id, newname)) if res: raise InvenioBibDocFileError, "A bibdoc called %s already exists for recid %s" % (newname, self.id) run_sql("update bibrec_bibdoc set docname=%s where id_bibdoc=%s and id_bibrec=%s", (newname, docid, self.id)) finally: # updating the document for a in doc.bibrec_links: if a["recid"] == self.id: a["docname"] = newname # updating the record structure del self.bibdocs[oldname] self.bibdocs[newname] = (doc, atttype) def has_docname_p(self, docname): """ @param docname: the document name, @type docname: string @return: True if a document with the given name is attached to this record. @rtype: bool """ return docname in self.bibdocs.keys() def get_bibdoc(self, docname): """ @return: the bibdoc with a particular docname associated with this recid""" if docname in self.bibdocs: return self.bibdocs[docname][0] raise InvenioBibDocFileError, "Recid '%s' is not connected with " \ " docname '%s'" % (self.id, docname) def delete_bibdoc(self, docname): """ Deletes the document with the specified I{docname}. @param docname: the document name. @type docname: string """ if docname in self.bibdocs: self.bibdocs[docname][0].delete() self.build_bibdoc_list() def add_bibdoc(self, doctype="Main", docname='file', never_fail=False): """ Add a new empty document object (a I{bibdoc}) to the list of documents of this record. @param doctype: the document type. @type doctype: string @param docname: the document name. @type docname: string @param never_fail: if True, this procedure will not fail, even if a document with the given name is already attached to this record. 
In this case a new name will be generated (see L{propose_unique_docname}). @type never_fail: bool @return: the newly created document object. @rtype: BibDoc @raise InvenioBibDocFileError: in case of any error. """ try: docname = normalize_docname(docname) if never_fail: docname = self.propose_unique_docname(docname) if docname in self.get_bibdoc_names(): raise InvenioBibDocFileError, \ "%s has already a bibdoc with docname %s" % (self.id, docname) else: bibdoc = BibDoc.create_instance(recid=self.id, doctype=doctype, docname=docname, human_readable=self.human_readable) self.build_bibdoc_list() return bibdoc except Exception, e: register_exception() raise InvenioBibDocFileError(str(e)) def add_new_file(self, fullpath, doctype="Main", docname=None, never_fail=False, description=None, comment=None, docformat=None, flags=None, modification_date=None): """ Directly add a new file to this record. Adds a new file with the following policy: - if the C{docname} is not set it is retrieved from the name of the file. - If a bibdoc with the given docname doesn't already exist, it is created and the file is added to it. - It it exist but it doesn't contain the format that is being added, the new format is added. - If the format already exists then if C{never_fail} is True a new bibdoc is created with a similar name but with a progressive number as a suffix and the file is added to it (see L{propose_unique_docname}). @param fullpath: the filesystme path of the document to be added. @type fullpath: string @param doctype: the type of the document. @type doctype: string @param docname: the document name. @type docname: string @param never_fail: if True, this procedure will not fail, even if a document with the given name is already attached to this record. In this case a new name will be generated (see L{propose_unique_docname}). @type never_fail: bool @param description: an optional description of the file. @type description: string @param comment: an optional comment to the file. 
@type comment: string @param format: the extension of the file. If not specified it will be guessed (see L{guess_format_from_url}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @return: the elaborated document object. @rtype: BibDoc @raise InvenioBibDocFileError: in case of error. """ if docname is None: docname = decompose_file(fullpath)[1] if docformat is None: docformat = decompose_file(fullpath)[2] docname = normalize_docname(docname) try: bibdoc = self.get_bibdoc(docname) except InvenioBibDocFileError: # bibdoc doesn't already exists! bibdoc = self.add_bibdoc(doctype, docname, False) bibdoc.add_file_new_version(fullpath, description=description, comment=comment, docformat=docformat, flags=flags, modification_date=modification_date) self.build_bibdoc_list() else: try: bibdoc.add_file_new_format(fullpath, description=description, comment=comment, docformat=docformat, flags=flags, modification_date=modification_date) self.build_bibdoc_list() except InvenioBibDocFileError, dummy: # Format already exist! if never_fail: bibdoc = self.add_bibdoc(doctype, docname, True) bibdoc.add_file_new_version(fullpath, description=description, comment=comment, docformat=docformat, flags=flags, modification_date=modification_date) self.build_bibdoc_list() else: raise return bibdoc def add_new_version(self, fullpath, docname=None, description=None, comment=None, docformat=None, flags=None): """ Adds a new file to an already existent document object as a new version. @param fullpath: the filesystem path of the file to be added. @type fullpath: string @param docname: the document name. If not specified it will be extracted from C{fullpath} (see L{decompose_file}). @type docname: string @param description: an optional description for the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. 
If not specified it will be guessed (see L{guess_format_from_url}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @return: the elaborated document object. @rtype: BibDoc @raise InvenioBibDocFileError: in case of error. @note: previous files associated with the same document will be considered obsolete. """ if docname is None: docname = decompose_file(fullpath)[1] if docformat is None: docformat = decompose_file(fullpath)[2] if flags is None: flags = [] if 'pdfa' in get_subformat_from_format(docformat).split(';') and not 'PDF/A' in flags: flags.append('PDF/A') bibdoc = self.get_bibdoc(docname=docname) bibdoc.add_file_new_version(fullpath, description=description, comment=comment, docformat=docformat, flags=flags) self.build_bibdoc_list() return bibdoc def add_new_format(self, fullpath, docname=None, description=None, comment=None, docformat=None, flags=None, modification_date=None): """ Adds a new file to an already existent document object as a new format. @param fullpath: the filesystem path of the file to be added. @type fullpath: string @param docname: the document name. If not specified it will be extracted from C{fullpath} (see L{decompose_file}). @type docname: string @param description: an optional description for the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be guessed (see L{guess_format_from_url}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @return: the elaborated document object. @rtype: BibDoc @raise InvenioBibDocFileError: in case the same format already exists. 
""" if docname is None: docname = decompose_file(fullpath)[1] if docformat is None: docformat = decompose_file(fullpath)[2] if flags is None: flags = [] if 'pdfa' in get_subformat_from_format(docformat).split(';') and not 'PDF/A' in flags: flags.append('PDF/A') bibdoc = self.get_bibdoc(docname=docname) bibdoc.add_file_new_format(fullpath, description=description, comment=comment, docformat=docformat, flags=flags, modification_date=modification_date) self.build_bibdoc_list() return bibdoc def list_latest_files(self, doctype='', list_hidden=True): """ Returns a list of the latest files. @param doctype: if set, only document of the given type will be listed. @type doctype: string @param list_hidden: if True, will list also files with the C{HIDDEN} flag being set. @type list_hidden: bool @return: the list of latest files. @rtype: list of BibDocFile """ docfiles = [] for bibdoc in self.list_bibdocs(doctype): docfiles += bibdoc.list_latest_files(list_hidden=list_hidden) return docfiles def fix(self, docname): """ Algorithm that transform a broken/old bibdoc into a coherent one. Think of it as being the fsck of BibDocs. - All the files in the bibdoc directory will be renamed according to the document name. Proper .recid, .type, .md5 files will be created/updated. - In case of more than one file with the same format version a new bibdoc will be created in order to put does files. @param docname: the document name that need to be fixed. @type docname: string @return: the list of newly created bibdocs if any. @rtype: list of BibDoc @raise InvenioBibDocFileError: in case of issues that can not be fixed automatically. """ bibdoc = self.get_bibdoc(docname) versions = {} res = [] new_bibdocs = [] # List of files with the same version/format of # existing file which need new bibdoc. counter = 0 zero_version_bug = False if os.path.exists(bibdoc.basedir): for filename in os.listdir(bibdoc.basedir): if filename[0] != '.' 
and ';' in filename: name, version = filename.split(';') try: version = int(version) except ValueError: # Strange name register_exception() raise InvenioBibDocFileError, "A file called %s exists under %s. This is not a valid name. After the ';' there must be an integer representing the file version. Please, manually fix this file either by renaming or by deleting it." % (filename, bibdoc.basedir) if version == 0: zero_version_bug = True docformat = name[len(file_strip_ext(name)):] docformat = normalize_format(docformat) if not versions.has_key(version): versions[version] = {} new_name = 'FIXING-%s-%s' % (str(counter), name) try: shutil.move('%s/%s' % (bibdoc.basedir, filename), '%s/%s' % (bibdoc.basedir, new_name)) except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in renaming '%s' to '%s': '%s'" % ('%s/%s' % (bibdoc.basedir, filename), '%s/%s' % (bibdoc.basedir, new_name), e) if versions[version].has_key(docformat): new_bibdocs.append((new_name, version)) else: versions[version][docformat] = new_name counter += 1 elif filename[0] != '.': # Strange name register_exception() raise InvenioBibDocFileError, "A file called %s exists under %s. This is not a valid name. There should be a ';' followed by an integer representing the file version. Please, manually fix this file either by renaming or by deleting it." 
% (filename, bibdoc.basedir) else: # we create the corresponding storage directory old_umask = os.umask(022) os.makedirs(bibdoc.basedir) # and save the father record id if it exists try: if self.id != "": recid_fd = open("%s/.recid" % bibdoc.basedir, "w") recid_fd.write(str(self.id)) recid_fd.close() if bibdoc.doctype != "": type_fd = open("%s/.type" % bibdoc.basedir, "w") type_fd.write(str(bibdoc.doctype)) type_fd.close() except Exception, e: register_exception() raise InvenioBibDocFileError, e os.umask(old_umask) if not versions: bibdoc.delete() else: for version, formats in versions.iteritems(): if zero_version_bug: version += 1 for docformat, filename in formats.iteritems(): destination = '%s%s;%i' % (docname, docformat, version) try: shutil.move('%s/%s' % (bibdoc.basedir, filename), '%s/%s' % (bibdoc.basedir, destination)) except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in renaming '%s' to '%s': '%s'" % ('%s/%s' % (bibdoc.basedir, filename), '%s/%s' % (bibdoc.basedir, destination), e) try: recid_fd = open("%s/.recid" % bibdoc.basedir, "w") recid_fd.write(str(self.id)) recid_fd.close() type_fd = open("%s/.type" % bibdoc.basedir, "w") type_fd.write(str(bibdoc.doctype)) type_fd.close() except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in creating .recid and .type file for '%s' folder: '%s'" % (bibdoc.basedir, e) self.build_bibdoc_list() res = [] for (filename, version) in new_bibdocs: if zero_version_bug: version += 1 new_bibdoc = self.add_bibdoc(doctype=bibdoc.doctype, docname=docname, never_fail=True) new_bibdoc.add_file_new_format('%s/%s' % (bibdoc.basedir, filename), version) res.append(new_bibdoc) try: os.remove('%s/%s' % (bibdoc.basedir, filename)) except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in removing '%s': '%s'" % ('%s/%s' % (bibdoc.basedir, filename), e) Md5Folder(bibdoc.basedir).update(only_new=False) bibdoc._build_file_list() self.build_bibdoc_list() for 
(bibdoc, dummyatttype) in self.bibdocs.values(): if not run_sql('SELECT data_value FROM bibdocmoreinfo WHERE bibdocid=%s', (bibdoc.id,)): ## Import from MARC only if the bibdoc has never had ## its more_info initialized. try: bibdoc.import_descriptions_and_comments_from_marc() except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in importing description and comment from %s for record %s: %s" % (repr(bibdoc), self.id, e) return res def check_format(self, docname): """ Check for any format related issue. In case L{CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS} is altered or Python version changes, it might happen that a docname contains files which are no more docname + .format ; version, simply because the .format is now recognized (and it was not before, so it was contained into the docname). This algorithm verify if it is necessary to fix (seel L{fix_format}). @param docname: the document name whose formats should be verified. @type docname: string @return: True if format is correct. False if a fix is needed. @rtype: bool @raise InvenioBibDocFileError: in case of any error. """ bibdoc = self.get_bibdoc(docname) correct_docname = decompose_file(docname + '.pdf')[1] if docname != correct_docname: return False for filename in os.listdir(bibdoc.basedir): if not filename.startswith('.'): try: dummy, dummy, docformat, version = decompose_file_with_version(filename) except Exception: raise InvenioBibDocFileError('Incorrect filename "%s" for docname %s for recid %i' % (filename, docname, self.id)) if '%s%s;%i' % (correct_docname, docformat, version) != filename: return False return True def check_duplicate_docnames(self): """ Check wethever the record is connected with at least tho documents with the same name. @return: True if everything is fine. 
        @rtype: bool
        """
        docnames = set()
        for docname in self.get_bibdoc_names():
            if docname in docnames:
                # Same docname seen twice: duplicates exist.
                return False
            else:
                docnames.add(docname)
        return True

    def uniformize_bibdoc(self, docname):
        """
        This algorithm corrects wrong file names belonging to a bibdoc.

        @param docname: the document name whose formats should be verified.
        @type docname: string
        """
        bibdoc = self.get_bibdoc(docname)
        for filename in os.listdir(bibdoc.basedir):
            # Skip hidden bookkeeping files (.recid, .type, .md5, ...).
            if not filename.startswith('.'):
                try:
                    dummy, dummy, docformat, version = decompose_file_with_version(filename)
                except ValueError:
                    # Unparsable file name: report it but keep processing the
                    # remaining files (best-effort fix).
                    register_exception(alert_admin=True, prefix= "Strange file '%s' is stored in %s" % (filename, bibdoc.basedir))
                else:
                    # Canonical on-disk name is "<docname><format>;<version>".
                    os.rename(os.path.join(bibdoc.basedir, filename), os.path.join(bibdoc.basedir, '%s%s;%i' % (docname, docformat, version)))
        Md5Folder(bibdoc.basedir).update()
        bibdoc.touch()
        bibdoc._build_file_list('rename')

    def fix_format(self, docname, skip_check=False):
        """
        Fixes format related inconsistencies.

        @param docname: the document name whose formats should be verified.
        @type docname: string
        @param skip_check: if True assume L{check_format} has already been
            called and the need for fix has already been found.
            If False, will implicitly call L{check_format} and skip fixing
            if no error is found.
        @type skip_check: bool
        @return: in case merging two bibdocs is needed but it's not possible.
        @rtype: bool
        """
        if not skip_check:
            if self.check_format(docname):
                # Nothing to fix.
                return True
        bibdoc = self.get_bibdoc(docname)
        # Re-derive what the docname should be once the format extension is
        # properly recognized (the '.pdf' suffix is a probe for decompose_file).
        correct_docname = decompose_file(docname + '.pdf')[1]
        need_merge = False
        if correct_docname != docname:
            need_merge = self.has_docname_p(correct_docname)
            if need_merge:
                # The corrected name is taken: rename to a unique placeholder
                # first, then merge it into the existing bibdoc.
                proposed_docname = self.propose_unique_docname(correct_docname)
                run_sql('UPDATE bibdoc SET docname=%s WHERE id=%s', (proposed_docname, bibdoc.id))
                self.build_bibdoc_list()
                self.uniformize_bibdoc(proposed_docname)
                try:
                    self.merge_bibdocs(docname, proposed_docname)
                except InvenioBibDocFileError:
                    # Merge impossible (e.g. format clash): report failure.
                    return False
            else:
                run_sql('UPDATE bibdoc SET docname=%s WHERE id=%s', (correct_docname, bibdoc.id))
                self.build_bibdoc_list()
                self.uniformize_bibdoc(correct_docname)
        else:
            # Docname already correct: only normalize the stored file names.
            self.uniformize_bibdoc(docname)
        return True

    def fix_duplicate_docnames(self, skip_check=False):
        """
        Algorithm to fix duplicate docnames.
        If a record is connected with at least two bibdoc having the same
        docname, the algorithm will try to merge them.

        @param skip_check: if True assume L{check_duplicate_docnames} has
            already been called and the need for fix has already been found.
            If False, will implicitly call L{check_duplicate_docnames} and
            skip fixing if no error is found.
@type skip_check: bool """ if not skip_check: if self.check_duplicate_docnames(): return docnames = set() for bibdoc in self.list_bibdocs(): docname = self.get_docname(bibdoc.id) if docname in docnames: new_docname = self.propose_unique_docname(self.get_docname(bibdoc.id)) self.change_name(docid=bibdoc.id, newname=new_docname) self.merge_bibdocs(docname, new_docname) docnames.add(docname) def get_text(self, extract_text_if_necessary=True): """ @return: concatenated texts of all bibdocs separated by " ": string """ texts = [] for bibdoc in self.list_bibdocs(): if hasattr(bibdoc, 'has_text'): if extract_text_if_necessary and not bibdoc.has_text(require_up_to_date=True): re_perform_ocr = re.compile(CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES) perform_ocr = bool(re_perform_ocr.match(bibdoc.get_docname())) from invenio.bibtask import write_message write_message("... will extract words from %s (docid: %s) %s" % (bibdoc.get_docname(), bibdoc.get_id(), perform_ocr and 'with OCR' or ''), verbose=2) bibdoc.extract_text(perform_ocr=perform_ocr) texts.append(bibdoc.get_text()) return " ".join(texts) class BibDoc(object): """ This class represents one document (i.e. a set of files with different formats and with versioning information that consitutes a piece of information. To instanciate a new document, the recid and the docname are mandatory. To instanciate an already existing document, either the recid and docname or the docid alone are sufficient to retrieve it. @param docid: the document identifier. @type docid: integer @param recid: the record identifier of the record to which this document belongs to. If the C{docid} is specified the C{recid} is automatically retrieven from the database. @type recid: integer @param docname: the document name. @type docname: string @param doctype: the document type (used when instanciating a new document). @type doctype: string @param human_readable: whether sizes should be represented in a human readable format. 
@type human_readable: bool @raise InvenioBibDocFileError: in case of error. """ @staticmethod def create_new_document(doc_type = "Main", rec_links = []): status = '' doc_id = run_sql("INSERT INTO bibdoc (status, creation_date, modification_date, doctype) " "values(%s,NOW(),NOW(), %s)", (status, doc_type)) if not doc_id: raise InvenioBibDocFileError, "New docid cannot be created" # creating the representation on disk ... preparing the directory try: BibDoc.prepare_basedir(doc_id) except Exception, e: run_sql('DELETE FROM bibdoc WHERE id=%s', (doc_id, )) # run_sql('DELETE FROM bibrec_bibdoc WHERE id_bibdoc=%s', (doc_id, )) register_exception(alert_admin=True) raise InvenioBibDocFileError, e # the object has been created: linking to bibliographical records doc = BibDoc(doc_id) for link in rec_links: if "rec_id" in link and link["rec_id"]: rec_id = link["rec_id"] doc_name = normalize_docname(link["doc_name"]) a_type = link["a_type"] doc.attach_to_record(rec_id, str(a_type), str(doc_name)) return doc_id def __init__(self, docid, human_readable=False, initial_data=None): """Constructor of a bibdoc. At least the docid or the recid/docname pair is needed. specifying recid, docname and doctype without specifying docid results in attaching newly created document to a record """ # docid is known, the document already exists res2 = run_sql("SELECT id_bibrec, type, docname FROM bibrec_bibdoc WHERE id_bibdoc=%s", (docid,)) self.bibrec_types = [(r[0], r[1], r[2]) for r in res2 ] # just in case the result was behaving like tuples but was something else if not res2: # fake attachment self.bibrec_types = [0, None, "fake_name_for_unattached_document"] if initial_data is None: initial_data = BibDoc._retrieve_data(docid) self.docfiles = [] self.__md5s = None self.human_readable = human_readable self.cd = initial_data["cd"] # creation date self.md = initial_data["md"] # modification date self.td = initial_data["td"] # text extraction date # should be moved from here !!!! 
self.bibrec_links = initial_data["bibrec_links"] self.id = initial_data["id"] self.status = initial_data["status"] self.basedir = initial_data["basedir"] self.doctype = initial_data["doctype"] self.storagename = initial_data["storagename"] # the old docname -> now used as a storage name for old records self.more_info = BibDocMoreInfo(self.id) self._build_file_list('init') # link with related_files self._build_related_file_list() @staticmethod def prepare_basedir(doc_id): """Prepares the directory serving as root of a BibDoc""" basedir = _make_base_dir(doc_id) # we create the corresponding storage directory if not os.path.exists(basedir): old_umask = os.umask(022) os.makedirs(basedir) os.umask(old_umask) def _update_additional_info_files_p(self): """Update the hidden file in the document directory ... the file contains all links to records""" try: reclinks_fd = open("%s/.reclinks" % (self.basedir, ), "w") reclinks_fd.write("RECID DOCNAME TYPE\n") for link in self.bibrec_links: reclinks_fd.write("%(recid)s %(docname)s %(doctype)s\n" % link) reclinks_fd.close() except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError, e @staticmethod def _retrieve_data(docid = None): """ Filling information about a document from the database entry """ container = {} container["bibrec_links"] = [] container["id"] = docid container["basedir"] = _make_base_dir(container["id"]) # retrieving links betwen records and documents res = run_sql("SELECT id_bibrec, type, docname FROM bibrec_bibdoc WHERE id_bibdoc=%s", (str(docid),), 1) if res: for r in res: container["bibrec_links"].append({"recid": r[0], "doctype": r[1], "docname": r[2]}) # gather the other information res = run_sql("SELECT status, creation_date, modification_date, text_extraction_date, doctype, docname FROM bibdoc WHERE id=%s LIMIT 1", (docid,), 1) if res: container["status"] = res[0][0] container["cd"] = res[0][1] container["md"] = res[0][2] container["td"] = res[0][3] container["doctype"] = 
res[0][4] container["storagename"] = res[0][5] else: # this bibdoc doesn't exist raise InvenioBibDocFileError, "The docid %s does not exist." % docid # retreiving all available formats fprefix = container["storagename"] or "content" container["extensions"] = [fname[len(fprefix):] for fname in filter(lambda x: x.startswith(fprefix),os.listdir(container["basedir"]))] return container @staticmethod def create_instance(docid=None, recid=None, docname=None, doctype='Fulltext', a_type = 'Main', human_readable=False): """ Parameters of an attachement to the record: a_type, recid, docname @param a_type Type of the attachment to the record (by default Main) @type a_type String @param doctype Type of the document itself (by default Fulltext) @type doctype String """ # first try to retrieve existing record based on obtained data data = None extensions = [] if docid != None: data = BibDoc._retrieve_data(docid) doctype = data["doctype"] extensions = data["extensions"] # now check if the doctypype is supported by any particular plugin def plugin_bldr(dummy, plugin_code): """Preparing the plugin dictionary structure""" ret = {} ret['create_instance'] = getattr(plugin_code, "create_instance", None) ret['supports'] = getattr(plugin_code, "supports", None) return ret bibdoc_plugins = PluginContainer( os.path.join(CFG_PYLIBDIR, 'invenio', 'bibdocfile_plugins', 'bom_*.py'), plugin_builder=plugin_bldr) # Loading an appropriate plugin (by default a generic BibDoc) used_plugin = None for dummy, plugin in bibdoc_plugins.iteritems(): if plugin['supports'](doctype, extensions): used_plugin = plugin if not docid: rec_links = [] if recid: rec_links.append({"rec_id": recid, "doc_name" : docname, "a_type": a_type}) if used_plugin and 'create_new' in used_plugin: docid = used_plugin['create_new'](doctype, rec_links) else: docid = BibDoc.create_new_document(doctype, rec_links) if used_plugin: return used_plugin['create_instance'](docid=docid, human_readable=human_readable, initial_data=data) 
return BibDoc(docid=docid, human_readable=human_readable, initial_data=data) # parameters can not be passed any more @staticmethod def _attach_to_record_p(doc_id, rec_id, a_type, docname): """Private core of a method attaching document of a given ID to a record @param a_type Attachment type (a function in which the document appears in the document) @type a_type String """ run_sql("INSERT INTO bibrec_bibdoc (id_bibrec, id_bibdoc, type, docname) VALUES (%s,%s,%s,%s)", (str(rec_id), str(doc_id), a_type, docname)) def attach_to_record(self, recid, a_type, docname): """ Attaches given document to a record given by its identifier. @param recid The identifier of the record @type recid Integer @param a_type Function of a document in the record @type a_type String @param docname Name of a document inside of a record @type docname String """ run_sql("INSERT INTO bibrec_bibdoc (id_bibrec, id_bibdoc, type, docname) VALUES (%s,%s,%s,%s)", (str(recid), str(self.id), a_type, docname)) self._update_additional_info_files_p() def __repr__(self): """ @return: the canonical string representation of the C{BibDoc}. @rtype: string """ return 'BibDoc(%s, %s, %s)' % (repr(self.id), repr(self.doctype), repr(self.human_readable)) def format_recids(self): """Returns a string representation of related record ids""" if len(self.bibrec_links) == 1: return self.bibrec_links[0]["recid"] return "[" + ",".join([str(el["recid"]) for el in self.bibrec_links]) + "]" def __str__(self): """ @return: an easy to be I{grepped} string representation of the whole C{BibDoc} content. 
        @rtype: string
        """
        recids = self.format_recids()
        # One "recids:docid:::key=value" line per property, greppable output.
        out = '%s:%i:::doctype=%s\n' % (recids, self.id, self.doctype)
        out += '%s:%i:::status=%s\n' % (recids, self.id, self.status)
        out += '%s:%i:::basedir=%s\n' % (recids, self.id, self.basedir)
        out += '%s:%i:::creation date=%s\n' % (recids, self.id, self.cd)
        out += '%s:%i:::modification date=%s\n' % (recids, self.id, self.md)
        out += '%s:%i:::text extraction date=%s\n' % (recids, self.id, self.td)
        out += '%s:%i:::total file attached=%s\n' % (recids, self.id, len(self.docfiles))
        if self.human_readable:
            out += '%s:%i:::total size latest version=%s\n' % (recids, self.id, nice_size(self.get_total_size_latest_version()))
            out += '%s:%i:::total size all files=%s\n' % (recids, self.id, nice_size(self.get_total_size()))
        else:
            out += '%s:%i:::total size latest version=%s\n' % (recids, self.id, self.get_total_size_latest_version())
            out += '%s:%i:::total size all files=%s\n' % (recids, self.id, self.get_total_size())
        for docfile in self.docfiles:
            out += str(docfile)
        return out

    def get_md5s(self):
        """
        @return: an instance of the Md5Folder class to access MD5 information
            of the current BibDoc
        @rtype: Md5Folder
        """
        # Lazily created and cached on first access.
        if self.__md5s is None:
            self.__md5s = Md5Folder(self.basedir)
        return self.__md5s
    md5s = property(get_md5s)

    def format_already_exists_p(self, docformat):
        """
        @param docformat: a format to be checked.
        @type docformat: string
        @return: True if a file of the given format already exists among the
            latest files.
        @rtype: bool
        """
        docformat = normalize_format(docformat)
        for afile in self.list_latest_files():
            if docformat == afile.get_format():
                return True
        return False

    def get_status(self):
        """
        @return: the status information.
        @rtype: string
        """
        return self.status

    @staticmethod
    def get_fileprefix(basedir, storagename=None):
        # Storage files are named after the storage name; "content" is the
        # default used by new-style documents.
        fname = "%s" % (storagename or "content", )
        return os.path.join(basedir, fname )

    def get_filepath(self, docformat, version):
        """ Generates the path inside of the filesystem where the document should be stored.
        @param format The format of the document
        @type format string
        @param version version to be stored in the file
        @type version string

        TODO: this should be completely replaced. File storage (and so, also
        path building) should be abstracted from BibDoc and be using loadable
        extensions

        @param format Format of the document to be stored
        @type format string
        @param version Version of the document to be stored
        @type version String

        @return Full path to the file encoding a particular version and format
            of the document
        @rtype string
        """
        # Layout: <basedir>/<storagename or "content"><format>;<version>
        return "%s%s;%i" % (BibDoc.get_fileprefix(self.basedir, self.storagename), docformat, version)

    def get_docname(self):
        """Obsolete !! (will return empty String for new format documents"""
        return self.storagename

    def get_doctype(self, recid):
        """Retrieves the type of this document in the scope of a given recid"""
        link_types = [attachement["doctype"] for attachement in \
                      filter(lambda x: str(x["recid"]) == str(recid), \
                             self.bibrec_links)]
        if link_types:
            return link_types[0]
        # Not attached to the given record.
        return ""

    def touch(self):
        """
        Update the modification time of the bibdoc (as in the UNIX command
        C{touch}).
        """
        run_sql('UPDATE bibdoc SET modification_date=NOW() WHERE id=%s', (self.id, ))
        # NOTE(review): dead code kept for reference -- the linked bibrec
        # rows are intentionally no longer touched here.
        #if self.recid:
            #run_sql('UPDATE bibrec SET modification_date=NOW() WHERE id=%s', (self.recid, ))

    def change_doctype(self, new_doctype):
        """
        Modify the doctype of a BibDoc
        """
        # Keep both the master bibdoc row and every record link in sync.
        run_sql('UPDATE bibdoc SET doctype=%s WHERE id=%s', (new_doctype, self.id))
        run_sql('UPDATE bibrec_bibdoc SET type=%s WHERE id_bibdoc=%s', (new_doctype, self.id))

    def set_status(self, new_status):
        """
        Set a new status. A document with a status information is a restricted document
        that can be accessed only to user which as an authorization to the
        I{viewrestrdoc} WebAccess action with keyword status with value
        C{new_status}.
        @param new_status: the new status. If empty the document will be
            unrestricted.
        @type new_status: string
        @raise InvenioBibDocFileError: in case the reserved word
            'DELETED' is used.
""" if new_status != KEEP_OLD_VALUE: if new_status == 'DELETED': raise InvenioBibDocFileError('DELETED is a reserved word and can not be used for setting the status') run_sql('UPDATE bibdoc SET status=%s WHERE id=%s', (new_status, self.id)) self.status = new_status self.touch() self._build_file_list() def add_file_new_version(self, filename, description=None, comment=None, docformat=None, flags=None, modification_date=None): """ Add a new version of a file. If no physical file is already attached to the document a the given file will have version 1. Otherwise the new file will have the current version number plus one. @param filename: the local path of the file. @type filename: string @param description: an optional description for the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be retrieved from the filename (see L{decompose_file}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @raise InvenioBibDocFileError: in case of error. 
""" try: latestVersion = self.get_latest_version() if latestVersion == 0: myversion = 1 else: myversion = latestVersion + 1 if os.path.exists(filename): if not os.path.getsize(filename) > 0: raise InvenioBibDocFileError, "%s seems to be empty" % filename if docformat is None: docformat = decompose_file(filename)[2] else: docformat = normalize_format(docformat) destination = self.get_filepath(docformat, myversion) if run_sql("SELECT id_bibdoc FROM bibdocfsinfo WHERE id_bibdoc=%s AND version=%s AND format=%s", (self.id, myversion, docformat)): raise InvenioBibDocFileError("According to the database a file of format %s is already attached to the docid %s" % (docformat, self.id)) try: shutil.copyfile(filename, destination) os.chmod(destination, 0644) if modification_date: # if the modification time of the file needs to be changed update_modification_date_of_file(destination, modification_date) except Exception, e: register_exception() raise InvenioBibDocFileError, "Encountered an exception while copying '%s' to '%s': '%s'" % (filename, destination, e) self.more_info.set_description(description, docformat, myversion) self.more_info.set_comment(comment, docformat, myversion) if flags is None: flags = [] if 'pdfa' in get_subformat_from_format(docformat).split(';') and not 'PDF/A' in flags: flags.append('PDF/A') for flag in flags: if flag == 'PERFORM_HIDE_PREVIOUS': for afile in self.list_all_files(): docformat = afile.get_format() version = afile.get_version() if version < myversion: self.more_info.set_flag('HIDDEN', docformat, myversion) else: self.more_info.set_flag(flag, docformat, myversion) else: raise InvenioBibDocFileError, "'%s' does not exists!" 
% filename finally: self.touch() Md5Folder(self.basedir).update() self._build_file_list() just_added_file = self.get_file(docformat, myversion) run_sql("INSERT INTO bibdocfsinfo(id_bibdoc, version, format, last_version, cd, md, checksum, filesize, mime) VALUES(%s, %s, %s, true, %s, %s, %s, %s, %s)", (self.id, myversion, docformat, just_added_file.cd, just_added_file.md, just_added_file.get_checksum(), just_added_file.get_size(), just_added_file.mime)) run_sql("UPDATE bibdocfsinfo SET last_version=false WHERE id_bibdoc=%s AND version<%s", (self.id, myversion)) def add_file_new_format(self, filename, version=None, description=None, comment=None, docformat=None, flags=None, modification_date=None): """ Add a file as a new format. @param filename: the local path of the file. @type filename: string @param version: an optional specific version to which the new format should be added. If None, the last version will be used. @type version: integer @param description: an optional description for the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be retrieved from the filename (see L{decompose_file}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @raise InvenioBibDocFileError: if the given format already exists. 
""" try: if version is None: version = self.get_latest_version() if version == 0: version = 1 if os.path.exists(filename): if not os.path.getsize(filename) > 0: raise InvenioBibDocFileError, "%s seems to be empty" % filename if docformat is None: docformat = decompose_file(filename)[2] else: docformat = normalize_format(docformat) if run_sql("SELECT id_bibdoc FROM bibdocfsinfo WHERE id_bibdoc=%s AND version=%s AND format=%s", (self.id, version, docformat)): raise InvenioBibDocFileError("According to the database a file of format %s is already attached to the docid %s" % (docformat, self.id)) destination = self.get_filepath(docformat, version) if os.path.exists(destination): raise InvenioBibDocFileError, "A file for docid '%s' already exists for the format '%s'" % (str(self.id), docformat) try: shutil.copyfile(filename, destination) os.chmod(destination, 0644) if modification_date: # if the modification time of the file needs to be changed update_modification_date_of_file(destination, modification_date) except Exception, e: register_exception() raise InvenioBibDocFileError, "Encountered an exception while copying '%s' to '%s': '%s'" % (filename, destination, e) self.more_info.set_comment(comment, docformat, version) self.more_info.set_description(description, docformat, version) if flags is None: flags = [] if 'pdfa' in get_subformat_from_format(docformat).split(';') and not 'PDF/A' in flags: flags.append('PDF/A') for flag in flags: if flag != 'PERFORM_HIDE_PREVIOUS': self.more_info.set_flag(flag, docformat, version) else: raise InvenioBibDocFileError, "'%s' does not exists!" 
% filename finally: Md5Folder(self.basedir).update() self.touch() self._build_file_list() just_added_file = self.get_file(docformat, version) run_sql("INSERT INTO bibdocfsinfo(id_bibdoc, version, format, last_version, cd, md, checksum, filesize, mime) VALUES(%s, %s, %s, true, %s, %s, %s, %s, %s)", (self.id, version, docformat, just_added_file.cd, just_added_file.md, just_added_file.get_checksum(), just_added_file.get_size(), just_added_file.mime)) def change_docformat(self, oldformat, newformat): """ Renames a format name on disk and in all BibDoc structures. The change will touch only the last version files. The change will take place only if the newformat doesn't already exist. @param oldformat: the format that needs to be renamed @type oldformat: string @param newformat: the format new name @type newformat: string """ oldformat = normalize_format(oldformat) newformat = normalize_format(newformat) if self.format_already_exists_p(newformat): # same format already exists in the latest files, abort return for bibdocfile in self.list_latest_files(): if bibdocfile.get_format() == oldformat: # change format -> rename x.oldformat -> x.newformat dirname, base, docformat, version = decompose_file_with_version(bibdocfile.get_full_path()) os.rename(bibdocfile.get_full_path(), os.path.join(dirname, '%s%s;%i' %(base, newformat, version))) Md5Folder(self.basedir).update() self.touch() self._build_file_list('rename') self._sync_to_db() return def purge(self): """ Physically removes all the previous version of the given bibdoc. Everything but the last formats will be erased. 
""" version = self.get_latest_version() if version > 1: for afile in self.docfiles: if afile.get_version() < version: self.more_info.unset_comment(afile.get_format(), afile.get_version()) self.more_info.unset_description(afile.get_format(), afile.get_version()) for flag in CFG_BIBDOCFILE_AVAILABLE_FLAGS: self.more_info.unset_flag(flag, afile.get_format(), afile.get_version()) try: os.remove(afile.get_full_path()) except Exception, dummy: register_exception() Md5Folder(self.basedir).update() self.touch() self._build_file_list() run_sql("DELETE FROM bibdocfsinfo WHERE id_bibdoc=%s AND version<%s", (self.id, version)) def expunge(self): """ Physically remove all the traces of a given document. @note: an expunged BibDoc object shouldn't be used anymore or the result might be unpredicted. """ del self.__md5s self.more_info.delete() del self.more_info os.system('rm -rf %s' % escape_shell_arg(self.basedir)) run_sql('DELETE FROM bibrec_bibdoc WHERE id_bibdoc=%s', (self.id, )) run_sql('DELETE FROM bibdoc_bibdoc WHERE id_bibdoc1=%s OR id_bibdoc2=%s', (self.id, self.id)) run_sql('DELETE FROM bibdoc WHERE id=%s', (self.id, )) run_sql('INSERT DELAYED INTO hstDOCUMENT(action, id_bibdoc, doctimestamp) VALUES("EXPUNGE", %s, NOW())', (self.id, )) run_sql('DELETE FROM bibdocfsinfo WHERE id_bibdoc=%s', (self.id, )) del self.docfiles del self.id del self.cd del self.md del self.td del self.basedir del self.doctype del self.bibrec_links def revert(self, version): """ Revert the document to a given version. All the formats corresponding to that version are copied forward to a new version. @param version: the version to revert to. 
        @type version: integer
        @raise InvenioBibDocFileError: in case of errors
        """
        version = int(version)
        docfiles = self.list_version_files(version)
        if docfiles:
            # The first file opens a brand new version; the remaining files of
            # the reverted version are re-attached as additional formats.
            self.add_file_new_version(docfiles[0].get_full_path(), description=docfiles[0].get_description(), comment=docfiles[0].get_comment(), docformat=docfiles[0].get_format(), flags=docfiles[0].flags)
            for docfile in docfiles[1:]:
                self.add_file_new_format(docfile.filename, description=docfile.get_description(), comment=docfile.get_comment(), docformat=docfile.get_format(), flags=docfile.flags)

    def import_descriptions_and_comments_from_marc(self, record=None):
        """
        Import descriptions and comments from the corresponding MARC metadata.
        @param record: the record (if None it will be calculated).
        @type record: bibrecord recstruct
        @note: If record is passed it is directly used, otherwise it is
            retrieved from the MARCXML stored in the database.
        """
        ## Let's get the record
        from invenio.search_engine import get_record
        if record is None:
            record = get_record(self.id)
        fields = record_get_field_instances(record, '856', '4', ' ')

        global_comment = None
        global_description = None
        local_comment = {}
        local_description = {}

        for field in fields:
            url = field_get_subfield_values(field, 'u')
            if url:
                ## Given a url
                url = url[0]
                if re.match('%s/%s/[0-9]+/files/' % (CFG_SITE_URL, CFG_SITE_RECORD), url):
                    ## If it is a traditional /CFG_SITE_RECORD/1/files/ one
                    ## We have global description/comment for all the formats
                    description = field_get_subfield_values(field, 'y')
                    if description:
                        global_description = description[0]
                    comment = field_get_subfield_values(field, 'z')
                    if comment:
                        global_comment = comment[0]
                elif bibdocfile_url_p(url):
                    ## Otherwise we have description/comment per format
                    dummy, docname, docformat = decompose_bibdocfile_url(url)
                    brd = BibRecDocs(self.id)
                    if docname == brd.get_docname(self.id):
                        description = field_get_subfield_values(field, 'y')
                        if description:
                            local_description[docformat] = description[0]
                        comment = field_get_subfield_values(field, 'z')
                        if comment:
                            local_comment[docformat] = comment[0]

        ## Let's update the tables
        version = self.get_latest_version()
        for docfile in self.list_latest_files():
            docformat = docfile.get_format()
            # Per-format metadata wins over the record-wide fallback.
            if docformat in local_comment:
                self.set_comment(local_comment[docformat], docformat, version)
            else:
                self.set_comment(global_comment, docformat, version)
            if docformat in local_description:
                self.set_description(local_description[docformat], docformat, version)
            else:
                self.set_description(global_description, docformat, version)
        self._build_file_list('init')

    def get_icon(self, subformat_re=CFG_BIBDOCFILE_ICON_SUBFORMAT_RE, display_hidden=True):
        """
        @param subformat_re: by default the convention is that
            L{CFG_BIBDOCFILE_ICON_SUBFORMAT_RE} is used as a subformat indicator
            to mean that a particular format is to be used as an icon.
            Specifiy a different subformat if you need to use a different
            convention.
        @type subformat_re: compiled regular expression
        @return: the bibdocfile corresponding to the icon of this document, or
            None if any icon exists for this document.
        @rtype: BibDocFile
        @warning: before I{subformat} were introduced this method was
            returning a BibDoc, while now is returning a BibDocFile. Check
            if your client code is compatible with this.
        """
        # First file of the latest version whose subformat marks it as icon.
        for docfile in self.list_latest_files(list_hidden=display_hidden):
            if subformat_re.match(docfile.get_subformat()):
                return docfile
        return None

    def add_icon(self, filename, docformat=None, subformat=CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT, modification_date=None):
        """
        Attaches icon to this document.
        @param filename: the local filesystem path to the icon.
        @type filename: string
        @param format: an optional format for the icon. If not specified it
            will be calculated after the filesystem path.
        @type format: string
        @param subformat: by default the convention is that
            CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT is used as a subformat
            indicator to mean that a particular format is to be used as an icon.
            Specifiy a different subformat if you need to use a different
            convention.
        @type subformat: string
        @raise InvenioBibDocFileError: in case of errors.
        """
        #first check if an icon already exists
        if not docformat:
            docformat = decompose_file(filename)[2]
        # The icon is just a regular format whose subformat marks it as icon.
        if subformat:
            docformat += ";%s" % subformat
        self.add_file_new_format(filename, docformat=docformat, modification_date=modification_date)

    def delete_icon(self, subformat_re=CFG_BIBDOCFILE_ICON_SUBFORMAT_RE):
        """
        @param subformat_re: by default the convention is that
            L{CFG_BIBDOCFILE_ICON_SUBFORMAT_RE} is used as a subformat indicator
            to mean that a particular format is to be used as an icon.
            Specifiy a different subformat if you need to use a different
            convention.
        @type subformat: compiled regular expression
        Removes the icon attached to the document if it exists.
        """
        # Only latest-version files are scanned: older icons stay untouched.
        for docfile in self.list_latest_files():
            if subformat_re.match(docfile.get_subformat()):
                self.delete_file(docfile.get_format(), docfile.get_version())

    def change_name(self, recid, newname):
        """
        Renames this document in connection with a given record.
        @param newname: the new name.
        @type newname: string
        @raise InvenioBibDocFileError: if the new name corresponds to
            a document already attached to the record owning this document.
        """
        try:
            newname = normalize_docname(newname)
            res = run_sql("SELECT id_bibdoc FROM bibrec_bibdoc WHERE id_bibrec=%s AND docname=%s", (recid, newname))
            if res:
                raise InvenioBibDocFileError, "A bibdoc called %s already exists for recid %s" % (newname, recid)
            run_sql("update bibrec_bibdoc set docname=%s where id_bibdoc=%s and id_bibrec=%s", (newname, self.id, recid))
        finally:
            # Bump the modification timestamp even when the rename failed.
            self.touch()

    def set_comment(self, comment, docformat, version=None):
        """
        Updates the comment of a specific format/version of the document.
        @param comment: the new comment.
        @type comment: string
        @param format: the specific format for which the comment should be
            updated.
        @type format: string
        @param version: the specific version for which the comment should be
            updated.
            If not specified the last version will be used.
        @type version: integer
        """
        if version is None:
            version = self.get_latest_version()
        docformat = normalize_format(docformat)
        self.more_info.set_comment(comment, docformat, version)
        self.touch()
        # 'init' context: refresh the cached file list without logging.
        self._build_file_list('init')

    def set_description(self, description, docformat, version=None):
        """
        Updates the description of a specific format/version of the document.
        @param description: the new description.
        @type description: string
        @param format: the specific format for which the description should be
            updated.
        @type format: string
        @param version: the specific version for which the description should
            be updated. If not specified the last version will be used.
        @type version: integer
        """
        if version is None:
            version = self.get_latest_version()
        docformat = normalize_format(docformat)
        self.more_info.set_description(description, docformat, version)
        self.touch()
        self._build_file_list('init')

    def set_flag(self, flagname, docformat, version=None):
        """
        Sets a flag for a specific format/version of the document.
        @param flagname: a flag from L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}.
        @type flagname: string
        @param format: the specific format for which the flag should be set.
        @type format: string
        @param version: the specific version for which the flag should be set.
            If not specified the last version will be used.
        @type version: integer
        """
        if version is None:
            version = self.get_latest_version()
        docformat = normalize_format(docformat)
        self.more_info.set_flag(flagname, docformat, version)
        self.touch()
        self._build_file_list('init')

    def has_flag(self, flagname, docformat, version=None):
        """
        Checks if a particular flag for a format/version is set.
        @param flagname: a flag from L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}.
        @type flagname: string
        @param format: the specific format for which the flag should be set.
        @type format: string
        @param version: the specific version for which the flag should be set.
            If not specified the last version will be used.
        @type version: integer
        @return: True if the flag is set.
        @rtype: bool
        """
        if version is None:
            version = self.get_latest_version()
        docformat = normalize_format(docformat)
        return self.more_info.has_flag(flagname, docformat, version)

    def unset_flag(self, flagname, docformat, version=None):
        """
        Unsets a flag for a specific format/version of the document.
        @param flagname: a flag from L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}.
        @type flagname: string
        @param format: the specific format for which the flag should be unset.
        @type format: string
        @param version: the specific version for which the flag should be
            unset. If not specified the last version will be used.
        @type version: integer
        """
        if version is None:
            version = self.get_latest_version()
        docformat = normalize_format(docformat)
        self.more_info.unset_flag(flagname, docformat, version)
        self.touch()
        # Refresh the cached file list (no history logging in 'init' context).
        self._build_file_list('init')

    def get_comment(self, docformat, version=None):
        """
        Retrieve the comment of a specific format/version of the document.
        @param format: the specific format for which the comment should be
            retrieved.
        @type format: string
        @param version: the specific version for which the comment should be
            retrieved. If not specified the last version will be used.
        @type version: integer
        @return: the comment.
        @rtype: string
        """
        if version is None:
            version = self.get_latest_version()
        docformat = normalize_format(docformat)
        return self.more_info.get_comment(docformat, version)

    def get_description(self, docformat, version=None):
        """
        Retrieve the description of a specific format/version of the document.
        @param format: the specific format for which the description should be
            retrieved.
        @type format: string
        @param version: the specific version for which the description should
            be retrieved. If not specified the last version will be used.
        @type version: integer
        @return: the description.
        @rtype: string
        """
        if version is None:
            version = self.get_latest_version()
        docformat = normalize_format(docformat)
        return self.more_info.get_description(docformat, version)

    def hidden_p(self, docformat, version=None):
        """
        Returns True if the file specified by the given format/version is
        hidden.
        @param format: the specific format for which the description should be
            retrieved.
        @type format: string
        @param version: the specific version for which the description should
            be retrieved. If not specified the last version will be used.
        @type version: integer
        @return: True if hidden.
        @rtype: bool
        """
        if version is None:
            version = self.get_latest_version()
        # NOTE(review): unlike the sibling accessors, docformat is NOT
        # normalized here -- callers must pass an already-normalized format.
        return self.more_info.has_flag('HIDDEN', docformat, version)

    def get_base_dir(self):
        """
        @return: the base directory on the local filesystem for this document
            (e.g. C{/soft/cdsweb/var/data/files/g0/123})
        @rtype: string
        """
        return self.basedir

    def get_type(self):
        """
        @return: the type of this document.
        @rtype: string"""
        return self.doctype

    def get_id(self):
        """
        @return: the id of this document.
        @rtype: integer
        """
        return self.id

    def get_file(self, docformat, version=""):
        """
        Returns a L{BibDocFile} instance of this document corresponding to the
        specific format and version.
        @param format: the specific format.
        @type format: string
        @param version: the specific version for which the description should
            be retrieved. If not specified the last version will be used.
        @type version: integer
        @return: the L{BibDocFile} instance.
        @rtype: BibDocFile
        """
        if version == "":
            docfiles = self.list_latest_files()
        else:
            version = int(version)
            docfiles = self.list_version_files(version)

        docformat = normalize_format(docformat)

        # First pass: exact format match (an empty format matches anything).
        for docfile in docfiles:
            if (docfile.get_format() == docformat or not docformat):
                return docfile

        ## Let's skip the subformat specification and consider just the
        ## superformat
        superformat = get_superformat_from_format(docformat)
        for docfile in docfiles:
            if get_superformat_from_format(docfile.get_format()) == superformat:
                return docfile

        raise InvenioBibDocFileError, "No file for doc %i of format '%s', version '%s'" % (self.id, docformat, version)

    def list_versions(self):
        """
        @return: the list of existing version numbers for this document.
        @rtype: list of integer
        """
        versions = []
        for docfile in self.docfiles:
            if not docfile.get_version() in versions:
                versions.append(docfile.get_version())
        versions.sort()
        return versions

    def delete(self, recid = None):
        """
        Delete this document.
        @see: L{undelete} for how to undelete the document.
        @raise InvenioBibDocFileError: in case of errors.
        """
        try:
            today = datetime.today()
            recids = []
            if recid:
                recids = [recid]
            else:
                recids = [link["recid"] for link in self.bibrec_links]

            for rid in recids:
                brd = BibRecDocs(rid)
                docname = brd.get_docname(self.id)
                # if the document is attached to some records
                # Rename to a timestamped DELETED-... name so the original
                # docname is freed and the deletion can later be undone.
                brd.change_name(docid=self.id, newname = 'DELETED-%s%s-%s' % (today.strftime('%Y%m%d%H%M%S'), today.microsecond, docname))

            run_sql("UPDATE bibdoc SET status='DELETED' WHERE id=%s", (self.id,))
            self.status = 'DELETED'
        except Exception, e:
            register_exception()
            raise InvenioBibDocFileError, "It's impossible to delete bibdoc %s: %s" % (self.id, e)

    def deleted_p(self):
        """
        @return: True if this document has been deleted.
        @rtype: bool
        """
        return self.status == 'DELETED'

    def empty_p(self):
        """
        @return: True if this document is empty, i.e. it has no bibdocfile
            connected.
        @rtype: bool
        """
        return len(self.docfiles) == 0

    def undelete(self, previous_status='', recid=None):
        """
        Undelete a deleted file (only if it was actually deleted via L{delete}).
        The previous C{status}, i.e. the restriction key can be provided.
        Otherwise the undeleted document will be public.
        @param previous_status: the previous status the should be restored.
        @type previous_status: string
        @raise InvenioBibDocFileError: in case of any error.
        """
        try:
            run_sql("UPDATE bibdoc SET status=%s WHERE id=%s AND status='DELETED'", (previous_status, self.id))
        except Exception, e:
            raise InvenioBibDocFileError, "It's impossible to undelete bibdoc %s: %s" % (self.id, e)
        if recid:
            bibrecdocs = BibRecDocs(recid)
            docname = bibrecdocs.get_docname(self.id)
            if docname.startswith('DELETED-'):
                try:
                    # Let's remove DELETED-20080214144322- in front of the docname
                    original_name = '-'.join(docname.split('-')[2:])
                    # Avoid clashing with a document created in the meantime.
                    original_name = bibrecdocs.propose_unique_docname(original_name)
                    bibrecdocs.change_name(docid=self.id, newname=original_name)
                except Exception, e:
                    raise InvenioBibDocFileError, "It's impossible to restore the previous docname %s. %s kept as docname because: %s" % (original_name, docname, e)
            else:
                raise InvenioBibDocFileError, "Strange just undeleted docname isn't called DELETED-somedate-docname but %s" % docname

    def delete_file(self, docformat, version):
        """
        Delete a specific format/version of this document on the filesystem.
        @param format: the particular format to be deleted.
        @type format: string
        @param version: the particular version to be deleted.
@type version: integer @note: this operation is not reversible!""" try: afile = self.get_file(docformat, version) except InvenioBibDocFileError: return try: os.remove(afile.get_full_path()) run_sql("DELETE FROM bibdocfsinfo WHERE id_bibdoc=%s AND version=%s AND format=%s", (self.id, afile.get_version(), afile.get_format())) last_version = run_sql("SELECT max(version) FROM bibdocfsinfo WHERE id_bibdoc=%s", (self.id, ))[0][0] if last_version: ## Updating information about last version run_sql("UPDATE bibdocfsinfo SET last_version=true WHERE id_bibdoc=%s AND version=%s", (self.id, last_version)) run_sql("UPDATE bibdocfsinfo SET last_version=false WHERE id_bibdoc=%s AND version<>%s", (self.id, last_version)) except OSError: pass self.touch() self._build_file_list() def get_history(self): """ @return: a human readable and parsable string that represent the history of this document. @rtype: string """ ret = [] hst = run_sql("""SELECT action, docname, docformat, docversion, docsize, docchecksum, doctimestamp FROM hstDOCUMENT WHERE id_bibdoc=%s ORDER BY doctimestamp ASC""", (self.id, )) for row in hst: ret.append("%s %s '%s', format: '%s', version: %i, size: %s, checksum: '%s'" % (row[6].strftime('%Y-%m-%d %H:%M:%S'), row[0], row[1], row[2], row[3], nice_size(row[4]), row[5])) return ret def _build_file_list(self, context=''): """ Lists all files attached to the bibdoc. This function should be called everytime the bibdoc is modified. As a side effect it log everything that has happened to the bibdocfiles in the log facility, according to the context: "init": means that the function has been called; for the first time by a constructor, hence no logging is performed "": by default means to log every deleted file as deleted and every added file as added; "rename": means that every appearently deleted file is logged as renamef and every new file as renamet. 
""" def log_action(action, docid, docname, docformat, version, size, checksum, timestamp=''): """Log an action into the bibdoclog table.""" try: if timestamp: run_sql('INSERT DELAYED INTO hstDOCUMENT(action, id_bibdoc, docname, docformat, docversion, docsize, docchecksum, doctimestamp) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)', (action, docid, docname, docformat, version, size, checksum, timestamp)) else: run_sql('INSERT DELAYED INTO hstDOCUMENT(action, id_bibdoc, docname, docformat, docversion, docsize, docchecksum, doctimestamp) VALUES(%s, %s, %s, %s, %s, %s, %s, NOW())', (action, docid, docname, docformat, version, size, checksum)) except DatabaseError: register_exception() def make_removed_added_bibdocfiles(previous_file_list): """Internal function for build the log of changed files.""" # Let's rebuild the previous situation old_files = {} for bibdocfile in previous_file_list: old_files[(bibdocfile.name, bibdocfile.format, bibdocfile.version)] = (bibdocfile.size, bibdocfile.checksum, bibdocfile.md) # Let's rebuild the new situation new_files = {} for bibdocfile in self.docfiles: new_files[(bibdocfile.name, bibdocfile.format, bibdocfile.version)] = (bibdocfile.size, bibdocfile.checksum, bibdocfile.md) # Let's subtract from added file all the files that are present in # the old list, and let's add to deleted files that are not present # added file. 
added_files = dict(new_files) deleted_files = {} for key, value in old_files.iteritems(): if added_files.has_key(key): del added_files[key] else: deleted_files[key] = value return (added_files, deleted_files) if context != ('init', 'init_from_disk'): previous_file_list = list(self.docfiles) res = run_sql("SELECT status, creation_date," "modification_date FROM bibdoc WHERE id=%s", (self.id,)) self.cd = res[0][1] self.md = res[0][2] self.status = res[0][0] self.more_info = BibDocMoreInfo(self.id) self.docfiles = [] if CFG_BIBDOCFILE_ENABLE_BIBDOCFSINFO_CACHE and context == 'init': ## In normal init context we read from DB res = run_sql("SELECT version, format, cd, md, checksum, filesize FROM bibdocfsinfo WHERE id_bibdoc=%s", (self.id, )) for version, docformat, cd, md, checksum, size in res: filepath = self.get_filepath(docformat, version) self.docfiles.append(BibDocFile( filepath, self.bibrec_types, version, docformat, self.id, self.status, checksum, self.more_info, human_readable=self.human_readable, cd=cd, md=md, size=size, bibdoc=self)) else: if os.path.exists(self.basedir): files = os.listdir(self.basedir) files.sort() for afile in files: if not afile.startswith('.'): try: filepath = os.path.join(self.basedir, afile) dummy, dummy, docformat, fileversion = decompose_file_with_version(filepath) checksum = self.md5s.get_checksum(afile) self.docfiles.append(BibDocFile(filepath, self.bibrec_types, fileversion, docformat, self.id, self.status, checksum, self.more_info, human_readable=self.human_readable, bibdoc=self)) except Exception, dummy: register_exception() if context in ('init', 'init_from_disk'): return else: added_files, deleted_files = make_removed_added_bibdocfiles(previous_file_list) deletedstr = "DELETED" addedstr = "ADDED" if context == 'rename': deletedstr = "RENAMEDFROM" addedstr = "RENAMEDTO" for (docname, docformat, version), (size, checksum, md) in added_files.iteritems(): if context == 'rename': md = '' # No modification time log_action(addedstr, 
                           self.id, docname, docformat, version, size, checksum, md)
            for (docname, docformat, version), (size, checksum, md) in deleted_files.iteritems():
                if context == 'rename':
                    md = '' # No modification time
                log_action(deletedstr, self.id, docname, docformat, version, size, checksum, md)

    def _sync_to_db(self):
        """
        Update the content of the bibdocfile table by taking what is
        available on the filesystem.
        """
        # Rebuild self.docfiles from disk, then mirror that state into the
        # bibdocfsinfo cache table (full delete + re-insert).
        self._build_file_list('init_from_disk')
        run_sql("DELETE FROM bibdocfsinfo WHERE id_bibdoc=%s", (self.id,))
        for afile in self.docfiles:
            run_sql("INSERT INTO bibdocfsinfo(id_bibdoc, version, format, last_version, cd, md, checksum, filesize, mime) VALUES(%s, %s, %s, false, %s, %s, %s, %s, %s)", (self.id, afile.get_version(), afile.get_format(), afile.cd, afile.md, afile.get_checksum(), afile.get_size(), afile.mime))
        # Rows are inserted with last_version=false; flag the rows of the
        # latest version in a second pass.
        run_sql("UPDATE bibdocfsinfo SET last_version=true WHERE id_bibdoc=%s AND version=%s", (self.id, self.get_latest_version()))

    def _build_related_file_list(self):
        """Lists all files attached to the bibdoc. This function should be
        called everytime the bibdoc is modified within e.g. its icon.
        @deprecated: use subformats instead.
        """
        # Maps rel_type -> list of related (non-deleted) BibDoc instances.
        self.related_files = {}
        res = run_sql("SELECT ln.id_bibdoc2,ln.rel_type,bibdoc.status FROM "
            "bibdoc_bibdoc AS ln,bibdoc WHERE bibdoc.id=ln.id_bibdoc2 AND "
            "ln.id_bibdoc1=%s", (str(self.id),))
        for row in res:
            docid = row[0]
            doctype = row[1]
            if row[2] != 'DELETED':
                if not self.related_files.has_key(doctype):
                    self.related_files[doctype] = []
                cur_doc = BibDoc.create_instance(docid=docid, human_readable=self.human_readable)
                self.related_files[doctype].append(cur_doc)

    def get_total_size_latest_version(self):
        """Return the total size used on disk of all the files belonging to
        this bibdoc and corresponding to the latest version."""
        ret = 0
        for bibdocfile in self.list_latest_files():
            ret += bibdocfile.get_size()
        return ret

    def get_total_size(self):
        """Return the total size used on disk of all the files belonging to
        this bibdoc (all versions)."""
        ret = 0
        for bibdocfile in self.list_all_files():
            ret += bibdocfile.get_size()
        return ret

    def list_all_files(self, list_hidden=True):
        """Returns all the docfiles linked with the given bibdoc.
        @param list_hidden: when False, docfiles flagged HIDDEN are skipped.
        """
        if list_hidden:
            return self.docfiles
        else:
            return [afile for afile in self.docfiles if not afile.hidden_p()]

    def list_latest_files(self, list_hidden=True):
        """Returns all the docfiles within the last version."""
        return self.list_version_files(self.get_latest_version(), list_hidden=list_hidden)

    def list_version_files(self, version, list_hidden=True):
        """Return all the docfiles of a particular version."""
        version = int(version)
        return [docfile for docfile in self.docfiles if docfile.get_version() == version and (list_hidden or not docfile.hidden_p())]

    def get_latest_version(self):
        """ Returns the latest existing version number for the given bibdoc.
        If no file is associated to this bibdoc, returns '0'.
        """
        version = 0
        for bibdocfile in self.docfiles:
            if bibdocfile.get_version() > version:
                version = bibdocfile.get_version()
        return version

    def get_file_number(self):
        """Return the total number of files."""
        return len(self.docfiles)

    def register_download(self, ip_address, version, docformat, userid=0, recid=0):
        """Register the information about a download of a particular file.
        Inserts a row into rnkDOWNLOADS; the format is normalized to an
        upper-case, dot-less form before insertion."""
        docformat = normalize_format(docformat)
        if docformat[:1] == '.':
            docformat = docformat[1:]
        docformat = docformat.upper()
        if not version:
            version = self.get_latest_version()
        return run_sql("INSERT DELAYED INTO rnkDOWNLOADS "
            "(id_bibrec,id_bibdoc,file_version,file_format,"
            "id_user,client_host,download_time) VALUES "
            "(%s,%s,%s,%s,%s,INET_ATON(%s),NOW())",
            (recid, self.id, version, docformat, userid, ip_address,))

    def get_incoming_relations(self, rel_type=None):
        """Return all relations in which this BibDoc appears on target position
        @param rel_type: Type of the relation, to which we want to limit our search. None = any type
        @type rel_type: string
        @return: List of BibRelation instances
        @rtype: list
        """
        return BibRelation.get_relations(rel_type = rel_type,
                                         bibdoc2_id = self.id)

    def get_outgoing_relations(self, rel_type=None):
        """Return all relations in which this BibDoc appears on source position
        @param rel_type: Type of the relation, to which we want to limit our search.
        None = any type
        @type rel_type: string
        @return: List of BibRelation instances
        @rtype: list
        """
        return BibRelation.get_relations(rel_type = rel_type,
                                         bibdoc1_id = self.id)

    def create_outgoing_relation(self, bibdoc2, rel_type):
        """
        Create an outgoing relation between current BibDoc and a different one
        """
        return BibRelation.create(bibdoc1_id = self.id, bibdoc2_id = bibdoc2.id, rel_type = rel_type)

    def create_incoming_relation(self, bibdoc1, rel_type):
        """
        Create an outgoing relation between a particular version of
        current BibDoc and a particular version of a different BibDoc
        """
        return BibRelation.create(bibdoc1_id = bibdoc1.id, bibdoc2_id = self.id, rel_type = rel_type)


def generic_path2bidocfile(fullpath):
    """
    Returns a BibDocFile objects that wraps the given fullpath.
    @note: the object will contain the minimum information that can be
        guessed from the fullpath (e.g. docname, format, subformat, version,
        md5, creation_date, modification_date). It won't contain for example
        a comment, a description, a doctype, a restriction.
    """
    fullpath = os.path.abspath(fullpath)
    try:
        path, name, docformat, version = decompose_file_with_version(fullpath)
    except ValueError:
        ## There is no version
        version = 0
        path, name, docformat = decompose_file(fullpath)
    # Checksum is looked up in (or computed into) the sibling .md5 file.
    md5folder = Md5Folder(path)
    checksum = md5folder.get_checksum(os.path.basename(fullpath))
    return BibDocFile(fullpath=fullpath,
        recid_doctypes=[(0, None, name)],
        version=version,
        docformat=docformat,
        docid=0,
        status=None,
        checksum=checksum,
        more_info=None)


class BibDocFile(object):
    """This class represents a physical file in the Invenio filesystem.
    It should never be instantiated directly"""

    def __init__(self, fullpath, recid_doctypes, version, docformat, docid, status, checksum, more_info=None, human_readable=False, cd=None, md=None, size=None, bibdoc = None):
        # cd/md/size may be passed in from the bibdocfsinfo DB cache; when
        # absent they are derived from the file on disk (stat calls below).
        self.fullpath = os.path.abspath(fullpath)
        self.docid = docid
        self.recids_doctypes = recid_doctypes
        self.version = version
        self.status = status
        self.checksum = checksum
        self.human_readable = human_readable
        # Docname: third element of the first (recid, doctype, docname) tuple.
        self.name = recid_doctypes[0][2]
        self.bibdoc = bibdoc
        if more_info:
            self.description = more_info.get_description(docformat, version)
            self.comment = more_info.get_comment(docformat, version)
            self.flags = more_info.get_flags(docformat, version)
        else:
            self.description = None
            self.comment = None
            self.flags = []
        self.format = normalize_format(docformat)
        self.superformat = get_superformat_from_format(self.format)
        self.subformat = get_subformat_from_format(self.format)
        if docformat:
            # Append the superformat extension to each docname so that mime
            # guessing below sees a full filename.
            self.recids_doctypes = [(a,b,c+self.superformat) for (a,b,c) in self.recids_doctypes]
        self.mime, self.encoding = _mimes.guess_type(self.recids_doctypes[0][2])
        if self.mime is None:
            self.mime = "application/octet-stream"
        self.more_info = more_info
        self.hidden = 'HIDDEN' in self.flags
        self.size = size or os.path.getsize(fullpath)
        self.md = md or datetime.fromtimestamp(os.path.getmtime(fullpath))
        try:
            self.cd = cd or datetime.fromtimestamp(os.path.getctime(fullpath))
        except OSError:
            # ctime unavailable: fall back to the modification time.
            self.cd = self.md
        self.dir = os.path.dirname(fullpath)
        # url points at the latest version; fullurl pins this exact version.
        if self.subformat:
            self.url = create_url('%s/%s/%s/files/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, self.recids_doctypes[0][0], self.name, self.superformat), {'subformat' : self.subformat})
            self.fullurl = create_url('%s/%s/%s/files/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, self.recids_doctypes[0][0], self.name, self.superformat), {'subformat' : self.subformat, 'version' : self.version})
        else:
            self.url = create_url('%s/%s/%s/files/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, self.recids_doctypes[0][0], self.name, self.superformat), {})
            self.fullurl = create_url('%s/%s/%s/files/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, self.recids_doctypes[0][0], self.name, self.superformat), {'version' : self.version})
        # ETag built from docid + format + version: changes whenever any of
        # the three changes, which is what HTTP caching needs.
        self.etag = '"%i%s%i"' % (self.docid, self.format, self.version)
        # Lazily computed by get_magic().
        self.magic = None

    def __repr__(self):
        return ('BibDocFile(%s, %i, %s, %s, %i, %i, %s, %s, %s, %s)' % (repr(self.fullpath), self.version, repr(self.name), repr(self.format), self.recids_doctypes[0][0], self.docid, repr(self.status), repr(self.checksum), repr(self.more_info), repr(self.human_readable)))

    def format_recids(self):
        # Delegate to the owning BibDoc when attached; "0" for orphans.
        if self.bibdoc:
            return self.bibdoc.format_recids()
        return "0"

    def __str__(self):
        # One "recids:docid:version:format:key=value" line per attribute.
        recids = self.format_recids()
        out = '%s:%s:%s:%s:fullpath=%s\n' % (recids, self.docid, self.version, self.format, self.fullpath)
        out += '%s:%s:%s:%s:name=%s\n' % (recids, self.docid, self.version, self.format, self.name)
        out += '%s:%s:%s:%s:subformat=%s\n' % (recids, self.docid, self.version, self.format, get_subformat_from_format(self.format))
        out += '%s:%s:%s:%s:status=%s\n' % (recids, self.docid, self.version, self.format, self.status)
        out += '%s:%s:%s:%s:checksum=%s\n' % (recids, self.docid, self.version, self.format, self.checksum)
        if self.human_readable:
            out += '%s:%s:%s:%s:size=%s\n' % (recids, self.docid, self.version, self.format, nice_size(self.size))
        else:
            out += '%s:%s:%s:%s:size=%s\n' % (recids, self.docid, self.version, self.format, self.size)
        out += '%s:%s:%s:%s:creation time=%s\n' % (recids, self.docid, self.version, self.format, self.cd)
        out += '%s:%s:%s:%s:modification time=%s\n' % (recids, self.docid, self.version, self.format, self.md)
        out += '%s:%s:%s:%s:magic=%s\n' % (recids, self.docid, self.version, self.format, self.get_magic())
        out += '%s:%s:%s:%s:mime=%s\n' % (recids, self.docid, self.version, self.format, self.mime)
        out += '%s:%s:%s:%s:encoding=%s\n' % (recids, self.docid, self.version, self.format, self.encoding)
        out += '%s:%s:%s:%s:url=%s\n' % (recids, self.docid, self.version, self.format, self.url)
        out +=
'%s:%s:%s:%s:fullurl=%s\n' % (recids, self.docid, self.version, self.format, self.fullurl) out += '%s:%s:%s:%s:description=%s\n' % (recids, self.docid, self.version, self.format, self.description) out += '%s:%s:%s:%s:comment=%s\n' % (recids, self.docid, self.version, self.format, self.comment) out += '%s:%s:%s:%s:hidden=%s\n' % (recids, self.docid, self.version, self.format, self.hidden) out += '%s:%s:%s:%s:flags=%s\n' % (recids, self.docid, self.version, self.format, self.flags) out += '%s:%s:%s:%s:etag=%s\n' % (recids, self.docid, self.version, self.format, self.etag) return out def is_restricted(self, user_info): """Returns restriction state. (see acc_authorize_action return values)""" if self.status not in ('', 'DELETED'): return check_bibdoc_authorization(user_info, status=self.status) elif self.status == 'DELETED': return (1, 'File has ben deleted') else: return (0, '') def is_icon(self, subformat_re=CFG_BIBDOCFILE_ICON_SUBFORMAT_RE): """ @param subformat_re: by default the convention is that L{CFG_BIBDOCFILE_ICON_SUBFORMAT_RE} is used as a subformat indicator to mean that a particular format is to be used as an icon. Specifiy a different subformat if you need to use a different convention. @type subformat: compiled regular expression @return: True if this file is an icon. 
@rtype: bool """ return bool(subformat_re.match(self.subformat)) def hidden_p(self): return self.hidden def get_url(self): return self.url def get_type(self): """Returns the first type connected with the bibdoc of this file.""" return self.recids_doctypes[0][1] def get_path(self): return self.fullpath def get_bibdocid(self): return self.docid def get_name(self): return self.name def get_full_name(self): """Returns the first name connected with the bibdoc of this file.""" return self.recids_doctypes[0][2] def get_full_path(self): return self.fullpath def get_format(self): return self.format def get_subformat(self): return self.subformat def get_superformat(self): return self.superformat def get_size(self): return self.size def get_version(self): return self.version def get_checksum(self): return self.checksum def get_description(self): return self.description def get_comment(self): return self.comment def get_content(self): """Returns the binary content of the file.""" content_fd = open(self.fullpath, 'rb') content = content_fd.read() content_fd.close() return content def get_recid(self): """Returns the first recid connected with the bibdoc of this file.""" return self.recids_doctypes[0][0] def get_status(self): """Returns the status of the file, i.e. 
either '', 'DELETED' or a restriction keyword.""" return self.status def get_magic(self): """Return all the possible guesses from the magic library about the content of the file.""" if self.magic is None: if CFG_HAS_MAGIC == 1: magic_cookies = _get_magic_cookies() magic_result = [] for key in magic_cookies.keys(): magic_result.append(magic_cookies[key].file(self.fullpath)) self.magic = tuple(magic_result) elif CFG_HAS_MAGIC == 2: magic_result = [] for key in ({'mime': False, 'mime_encoding': False}, {'mime': True, 'mime_encoding': False}, {'mime': False, 'mime_encoding': True}): magic_result.append(_magic_wrapper(self.fullpath, **key)) self.magic = tuple(magic_result) return self.magic def check(self): """Return True if the checksum corresponds to the file.""" return calculate_md5(self.fullpath) == self.checksum def stream(self, req, download=False): """Stream the file. Note that no restriction check is being done here, since restrictions have been checked previously inside websubmit_webinterface.py.""" if os.path.exists(self.fullpath): if random.random() < CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY and calculate_md5(self.fullpath) != self.checksum: raise InvenioBibDocFileError, "File %s, version %i, is corrupted!" % (self.recids_doctypes[0][2], self.version) stream_file(req, self.fullpath, "%s%s" % (self.name, self.superformat), self.mime, self.encoding, self.etag, self.checksum, self.fullurl, download=download) raise apache.SERVER_RETURN, apache.DONE else: req.status = apache.HTTP_NOT_FOUND raise InvenioBibDocFileError, "%s does not exists!" % self.fullpath _RE_STATUS_PARSER = re.compile(r'^(?Pemail|group|egroup|role|firerole|status):\s*(?P.*)$', re.S + re.I) def check_bibdoc_authorization(user_info, status): """ Check if the user is authorized to access a document protected with the given status. 
L{status} is a string of the form:: auth_type: auth_value where C{auth_type} can have values in:: email, group, role, firerole, status and C{auth_value} has a value interpreted againsta C{auth_type}: - C{email}: the user can access the document if his/her email matches C{auth_value} - C{group}: the user can access the document if one of the groups (local or external) of which he/she is member matches C{auth_value} - C{role}: the user can access the document if he/she belongs to the WebAccess role specified in C{auth_value} - C{firerole}: the user can access the document if he/she is implicitly matched by the role described by the firewall like role definition in C{auth_value} - C{status}: the user can access the document if he/she is authorized to for the action C{viewrestrdoc} with C{status} paramter having value C{auth_value} @note: If no C{auth_type} is specified or if C{auth_type} is not one of the above, C{auth_value} will be set to the value contained in the parameter C{status}, and C{auth_type} will be considered to be C{status}. @param user_info: the user_info dictionary @type: dict @param status: the status of the document. @type status: string @return: a tuple, of the form C{(auth_code, auth_message)} where auth_code is 0 if the authorization is granted and greater than 0 otherwise. @rtype: (int, string) @raise ValueError: in case of unexpected parsing error. 
""" if not status: return (0, CFG_WEBACCESS_WARNING_MSGS[0]) def parse_status(status): g = _RE_STATUS_PARSER.match(status) if g: return (g.group('type').lower(), g.group('value')) else: return ('status', status) if acc_is_user_in_role(user_info, acc_get_role_id(SUPERADMINROLE)): return (0, CFG_WEBACCESS_WARNING_MSGS[0]) auth_type, auth_value = parse_status(status) if auth_type == 'status': return acc_authorize_action(user_info, 'viewrestrdoc', status=auth_value) elif auth_type == 'email': if not auth_value.lower().strip() == user_info['email'].lower().strip(): return (1, 'You must be member of the group %s in order to access this document' % repr(auth_value)) elif auth_type == 'group': if not auth_value in user_info['group']: return (1, 'You must be member of the group %s in order to access this document' % repr(auth_value)) elif auth_type == 'role': if not acc_is_user_in_role(user_info, acc_get_role_id(auth_value)): return (1, 'You must be member in the role %s in order to access this document' % repr(auth_value)) elif auth_type == 'firerole': if not acc_firerole_check_user(user_info, compile_role_definition(auth_value)): return (1, 'You must be authorized in order to access this document') else: raise ValueError, 'Unexpected authorization type %s for %s' % (repr(auth_type), repr(auth_value)) return (0, CFG_WEBACCESS_WARNING_MSGS[0]) _RE_BAD_MSIE = re.compile("MSIE\s+(\d+\.\d+)") def stream_file(req, fullpath, fullname=None, mime=None, encoding=None, etag=None, md5str=None, location=None, download=False): """This is a generic function to stream a file to the user. If fullname, mime, encoding, and location are not provided they will be guessed based on req and fullpath. md5str should be passed as an hexadecimal string. 
""" def normal_streaming(size): req.set_content_length(size) req.send_http_header() if not req.header_only: req.sendfile(fullpath) return "" def single_range(size, the_range): req.set_content_length(the_range[1]) req.headers_out['Content-Range'] = 'bytes %d-%d/%d' % (the_range[0], the_range[0] + the_range[1] - 1, size) req.status = apache.HTTP_PARTIAL_CONTENT req.send_http_header() if not req.header_only: req.sendfile(fullpath, the_range[0], the_range[1]) return "" def multiple_ranges(size, ranges, mime): req.status = apache.HTTP_PARTIAL_CONTENT boundary = '%s%04d' % (time.strftime('THIS_STRING_SEPARATES_%Y%m%d%H%M%S'), random.randint(0, 9999)) req.content_type = 'multipart/byteranges; boundary=%s' % boundary content_length = 0 for arange in ranges: content_length += len('--%s\r\n' % boundary) content_length += len('Content-Type: %s\r\n' % mime) content_length += len('Content-Range: bytes %d-%d/%d\r\n' % (arange[0], arange[0] + arange[1] - 1, size)) content_length += len('\r\n') content_length += arange[1] content_length += len('\r\n') content_length += len('--%s--\r\n' % boundary) req.set_content_length(content_length) req.send_http_header() if not req.header_only: for arange in ranges: req.write('--%s\r\n' % boundary, 0) req.write('Content-Type: %s\r\n' % mime, 0) req.write('Content-Range: bytes %d-%d/%d\r\n' % (arange[0], arange[0] + arange[1] - 1, size), 0) req.write('\r\n', 0) req.sendfile(fullpath, arange[0], arange[1]) req.write('\r\n', 0) req.write('--%s--\r\n' % boundary) req.flush() return "" def parse_date(date): """According to a date can come in three formats (in order of preference): Sun, 06 Nov 1994 08:49:37 GMT ; RFC 822, updated by RFC 1123 Sunday, 06-Nov-94 08:49:37 GMT ; RFC 850, obsoleted by RFC 1036 Sun Nov 6 08:49:37 1994 ; ANSI C's asctime() format Moreover IE is adding some trailing information after a ';'. Wrong dates should be simpled ignored. 
This function return the time in seconds since the epoch GMT or None in case of errors.""" if not date: return None try: date = date.split(';')[0].strip() # Because of IE ## Sun, 06 Nov 1994 08:49:37 GMT return time.mktime(time.strptime(date, '%a, %d %b %Y %X %Z')) except: try: ## Sun, 06 Nov 1994 08:49:37 GMT return time.mktime(time.strptime(date, '%A, %d-%b-%y %H:%M:%S %Z')) except: try: ## Sun, 06 Nov 1994 08:49:37 GMT return time.mktime(date) except: return None def parse_ranges(ranges): """According to a (multiple) range request comes in the form: bytes=20-30,40-60,70-,-80 with the meaning: from byte to 20 to 30 inclusive (11 bytes) from byte to 40 to 60 inclusive (21 bytes) from byte 70 to (size - 1) inclusive (size - 70 bytes) from byte size - 80 to (size - 1) inclusive (80 bytes) This function will return the list of ranges in the form: [[first_byte, last_byte], ...] If first_byte or last_byte aren't specified they'll be set to None If the list is not well formatted it will return None """ try: if ranges.startswith('bytes') and '=' in ranges: ranges = ranges.split('=')[1].strip() else: return None ret = [] for arange in ranges.split(','): arange = arange.strip() if arange.startswith('-'): ret.append([None, int(arange[1:])]) elif arange.endswith('-'): ret.append([int(arange[:-1]), None]) else: ret.append(map(int, arange.split('-'))) return ret except: return None def parse_tags(tags): """Return a list of tags starting from a comma separated list.""" return [tag.strip() for tag in tags.split(',')] def fix_ranges(ranges, size): """Complementary to parse_ranges it will transform all the ranges into (first_byte, length), adjusting all the value based on the actual size provided. 
""" ret = [] for arange in ranges: if (arange[0] is None and arange[1] > 0) or arange[0] < size: if arange[0] is None: arange[0] = size - arange[1] elif arange[1] is None: arange[1] = size - arange[0] else: arange[1] = arange[1] - arange[0] + 1 arange[0] = max(0, arange[0]) arange[1] = min(size - arange[0], arange[1]) if arange[1] > 0: ret.append(arange) return ret def get_normalized_headers(): """Strip and lowerize all the keys of the headers dictionary plus strip, lowerize and transform known headers value into their value.""" ret = { 'if-match' : None, 'unless-modified-since' : None, 'if-modified-since' : None, 'range' : None, 'if-range' : None, 'if-none-match' : None, } for key, value in req.headers_in.iteritems(): key = key.strip().lower() value = value.strip() if key in ('unless-modified-since', 'if-modified-since'): value = parse_date(value) elif key == 'range': value = parse_ranges(value) elif key == 'if-range': value = parse_date(value) or parse_tags(value) elif key in ('if-match', 'if-none-match'): value = parse_tags(value) if value: ret[key] = value return ret headers = get_normalized_headers() g = _RE_BAD_MSIE.search(headers.get('user-agent', "MSIE 6.0")) bad_msie = g and float(g.group(1)) < 9.0 if CFG_BIBDOCFILE_USE_XSENDFILE: ## If XSendFile is supported by the server, let's use it. 
if os.path.exists(fullpath): if fullname is None: fullname = os.path.basename(fullpath) if bad_msie: ## IE is confused by quotes req.headers_out["Content-Disposition"] = 'attachment; filename=%s' % fullname.replace('"', '\\"') elif download: req.headers_out["Content-Disposition"] = 'attachment; filename="%s"' % fullname.replace('"', '\\"') else: ## IE is confused by inline req.headers_out["Content-Disposition"] = 'inline; filename="%s"' % fullname.replace('"', '\\"') req.headers_out["X-Sendfile"] = fullpath if mime is None: (mime, encoding) = _mimes.guess_type(fullpath) if mime is None: mime = "application/octet-stream" if not bad_msie: ## IE is confused by not supported mimetypes req.content_type = mime return "" else: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND if headers['if-match']: if etag is not None and etag not in headers['if-match']: raise apache.SERVER_RETURN, apache.HTTP_PRECONDITION_FAILED if os.path.exists(fullpath): mtime = os.path.getmtime(fullpath) if fullname is None: fullname = os.path.basename(fullpath) if mime is None: (mime, encoding) = _mimes.guess_type(fullpath) if mime is None: mime = "application/octet-stream" if location is None: location = req.uri if not bad_msie: ## IE is confused by not supported mimetypes req.content_type = mime req.encoding = encoding req.filename = fullname req.headers_out["Last-Modified"] = time.strftime('%a, %d %b %Y %X GMT', time.gmtime(mtime)) if CFG_ENABLE_HTTP_RANGE_REQUESTS: req.headers_out["Accept-Ranges"] = "bytes" else: req.headers_out["Accept-Ranges"] = "none" req.headers_out["Content-Location"] = location if etag is not None: req.headers_out["ETag"] = etag if md5str is not None: req.headers_out["Content-MD5"] = base64.encodestring(binascii.unhexlify(md5str.upper()))[:-1] if bad_msie: ## IE is confused by quotes req.headers_out["Content-Disposition"] = 'attachment; filename=%s' % fullname.replace('"', '\\"') elif download: req.headers_out["Content-Disposition"] = 'attachment; filename="%s"' % 
fullname.replace('"', '\\"') else: ## IE is confused by inline req.headers_out["Content-Disposition"] = 'inline; filename="%s"' % fullname.replace('"', '\\"') size = os.path.getsize(fullpath) if not size: try: raise Exception, '%s exists but is empty' % fullpath except Exception: register_exception(req=req, alert_admin=True) raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND if headers['if-modified-since'] and headers['if-modified-since'] >= mtime: raise apache.SERVER_RETURN, apache.HTTP_NOT_MODIFIED if headers['if-none-match']: if etag is not None and etag in headers['if-none-match']: raise apache.SERVER_RETURN, apache.HTTP_NOT_MODIFIED if headers['unless-modified-since'] and headers['unless-modified-since'] < mtime: return normal_streaming(size) if CFG_ENABLE_HTTP_RANGE_REQUESTS and headers['range']: try: if headers['if-range']: if etag is None or etag not in headers['if-range']: return normal_streaming(size) ranges = fix_ranges(headers['range'], size) except: return normal_streaming(size) if len(ranges) > 1: return multiple_ranges(size, ranges, mime) elif ranges: return single_range(size, ranges[0]) else: raise apache.SERVER_RETURN, apache.HTTP_RANGE_NOT_SATISFIABLE else: return normal_streaming(size) else: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND def stream_restricted_icon(req): """Return the content of the "Restricted Icon" file.""" stream_file(req, '%s/img/restricted.gif' % CFG_WEBDIR) raise apache.SERVER_RETURN, apache.DONE #def list_versions_from_array(docfiles): # """Retrieve the list of existing versions from the given docfiles list.""" # versions = [] # for docfile in docfiles: # if not docfile.get_version() in versions: # versions.append(docfile.get_version()) # versions.sort() # versions.reverse() # return versions def _make_base_dir(docid): """Given a docid it returns the complete path that should host its files.""" group = "g" + str(int(int(docid) / CFG_BIBDOCFILE_FILESYSTEM_BIBDOC_GROUP_LIMIT)) return os.path.join(CFG_BIBDOCFILE_FILEDIR, 
                        group, str(docid))

class Md5Folder(object):
    """Manage all the Md5 checksum about a folder"""
    def __init__(self, folder):
        """Initialize the class from the md5 checksum of a given path"""
        self.folder = folder
        # Populates self.md5s from the folder's .md5 file (or computes it).
        self.load()

    def update(self, only_new=True):
        """Update the .md5 file with the current files. If only_new
        is specified then only not already calculated file are calculated."""
        if not only_new:
            self.md5s = {}
        if os.path.exists(self.folder):
            for filename in os.listdir(self.folder):
                # Dot-files (including .md5 itself) are never hashed.
                if filename not in self.md5s and not filename.startswith('.'):
                    self.md5s[filename] = calculate_md5(os.path.join(self.folder, filename))
        self.store()

    def store(self):
        """Store the current md5 dictionary into .md5"""
        try:
            old_umask = os.umask(022)
            # md5sum-compatible format: "<hash> *<filename>".
            md5file = open(os.path.join(self.folder, ".md5"), "w")
            for key, value in self.md5s.items():
                md5file.write('%s *%s\n' % (value, key))
            md5file.close()
            os.umask(old_umask)
        except Exception, e:
            register_exception(alert_admin=True)
            raise InvenioBibDocFileError("Encountered an exception while storing .md5 for folder '%s': '%s'" % (self.folder, e))

    def load(self):
        """Load .md5 into the md5 dictionary"""
        self.md5s = {}
        md5_path = os.path.join(self.folder, ".md5")
        if os.path.exists(md5_path):
            for row in open(md5_path, "r"):
                # Fixed-width parse of "<32-char hash> *<filename>".
                md5hash = row[:32]
                filename = row[34:].strip()
                self.md5s[filename] = md5hash
        else:
            # No .md5 yet: compute and store it now.
            self.update()

    def check(self, filename=''):
        """Check the specified file or all the files for which it exists a hash
        for being coherent with the stored hash."""
        if filename and filename in self.md5s.keys():
            try:
                # NOTE(review): returns the comparison result (a bool) rather
                # than raising on mismatch, mirroring the all-files branch.
                return self.md5s[filename] == calculate_md5(os.path.join(self.folder, filename))
            except Exception, e:
                register_exception(alert_admin=True)
                raise InvenioBibDocFileError("Encountered an exception while loading '%s': '%s'" % (os.path.join(self.folder, filename), e))
        else:
            for filename, md5hash in self.md5s.items():
                try:
                    if calculate_md5(os.path.join(self.folder, filename)) != md5hash:
                        return False
                except Exception, e:
                    register_exception(alert_admin=True)
                    raise InvenioBibDocFileError("Encountered an exception while loading '%s': '%s'" % (os.path.join(self.folder, filename), e))
            return True

    def get_checksum(self, filename):
        """Return the checksum of a physical file."""
        md5hash = self.md5s.get(filename, None)
        if md5hash is None:
            # The file is new: refresh the folder's hashes first.
            self.update()
            # Now it should not fail!
            md5hash = self.md5s[filename]
        return md5hash

def calculate_md5_external(filename):
    """Calculate the md5 of a physical file through md5sum Command Line Tool.
    This is suitable for file larger than 256Kb."""
    try:
        md5_result = os.popen(CFG_PATH_MD5SUM + ' -b %s' % escape_shell_arg(filename))
        ret = md5_result.read()[:32]
        md5_result.close()
        if len(ret) != 32:
            # Error in running md5sum. Let's fallback to internal
            # algorithm.
            return calculate_md5(filename, force_internal=True)
        else:
            return ret
    except Exception, e:
        raise InvenioBibDocFileError("Encountered an exception while calculating md5 for file '%s': '%s'" % (filename, e))

def calculate_md5(filename, force_internal=False):
    """Calculate the md5 of a physical file. This is suitable for files smaller
    than 256Kb."""
    # Small files (or no md5sum binary configured) are hashed in-process,
    # in CFG_BIBDOCFILE_MD5_BUFFER-sized chunks; large files are delegated
    # to the external md5sum tool.
    if not CFG_PATH_MD5SUM or force_internal or os.path.getsize(filename) < CFG_BIBDOCFILE_MD5_THRESHOLD:
        try:
            to_be_read = open(filename, "rb")
            computed_md5 = md5()
            while True:
                buf = to_be_read.read(CFG_BIBDOCFILE_MD5_BUFFER)
                if buf:
                    computed_md5.update(buf)
                else:
                    break
            to_be_read.close()
            return computed_md5.hexdigest()
        except Exception, e:
            register_exception(alert_admin=True)
            raise InvenioBibDocFileError("Encountered an exception while calculating md5 for file '%s': '%s'" % (filename, e))
    else:
        return calculate_md5_external(filename)

def bibdocfile_url_to_bibrecdocs(url):
    """Given an URL in the form CFG_SITE_[SECURE_]URL/CFG_SITE_RECORD/xxx/files/...
    it returns a BibRecDocs object for the corresponding recid."""
    recid = decompose_bibdocfile_url(url)[0]
    return BibRecDocs(recid)

def bibdocfile_url_to_bibdoc(url):
    """Given an URL in the form CFG_SITE_[SECURE_]URL/CFG_SITE_RECORD/xxx/files/...
    it returns a BibDoc object for the corresponding recid/docname."""
    docname = decompose_bibdocfile_url(url)[1]
    return bibdocfile_url_to_bibrecdocs(url).get_bibdoc(docname)

def bibdocfile_url_to_bibdocfile(url):
    """Given an URL in the form CFG_SITE_[SECURE_]URL/CFG_SITE_RECORD/xxx/files/...
    it returns a BibDocFile object for the corresponding recid/docname/format."""
    docformat = decompose_bibdocfile_url(url)[2]
    return bibdocfile_url_to_bibdoc(url).get_file(docformat)

def bibdocfile_url_to_fullpath(url):
    """Given an URL in the form CFG_SITE_[SECURE_]URL/CFG_SITE_RECORD/xxx/files/...
    it returns the fullpath for the corresponding recid/docname/format."""
    return bibdocfile_url_to_bibdocfile(url).get_full_path()

def bibdocfile_url_p(url):
    """Return True when the url is a potential valid url pointing to a
    fulltext owned by a system."""
    if url.startswith('%s/getfile.py' % CFG_SITE_URL) or url.startswith('%s/getfile.py' % CFG_SITE_SECURE_URL):
        return True
    if not (url.startswith('%s/%s/' % (CFG_SITE_URL, CFG_SITE_RECORD)) or url.startswith('%s/%s/' % (CFG_SITE_SECURE_URL, CFG_SITE_RECORD))):
        return False
    # A valid new-style URL has exactly one '/files/' separator with
    # non-empty text on both sides.
    splitted_url = url.split('/files/')
    return len(splitted_url) == 2 and splitted_url[0] != '' and splitted_url[1] != ''

def get_docid_from_bibdocfile_fullpath(fullpath):
    """Given a bibdocfile fullpath (e.g. "CFG_BIBDOCFILE_FILEDIR/g0/123/bar.pdf;1")
    returns the docid (e.g. 123)."""
    if not fullpath.startswith(os.path.join(CFG_BIBDOCFILE_FILEDIR, 'g')):
        raise InvenioBibDocFileError, "Fullpath %s doesn't correspond to a valid bibdocfile fullpath" % fullpath
    # The docid is the last path component of the containing directory.
    dirname = decompose_file_with_version(fullpath)[0]
    try:
        return int(dirname.split('/')[-1])
    except:
        raise InvenioBibDocFileError, "Fullpath %s doesn't correspond to a valid bibdocfile fullpath" % fullpath

def decompose_bibdocfile_fullpath(fullpath):
    """Given a bibdocfile fullpath (e.g. "CFG_BIBDOCFILE_FILEDIR/g0/123/bar.pdf;1")
    returns a quadruple (recid, docname, format, version).
    NOTE(review): despite the wording above, the function actually returns a
    dict with keys "doc_id", "extension" and "version" -- verify callers.
    """
    if not fullpath.startswith(os.path.join(CFG_BIBDOCFILE_FILEDIR, 'g')):
        raise InvenioBibDocFileError, "Fullpath %s doesn't correspond to a valid bibdocfile fullpath" % fullpath
    dirname, dummy, extension, version = decompose_file_with_version(fullpath)
    try:
        docid = int(dirname.split('/')[-1])
        return {"doc_id" : docid, "extension": extension, "version": version}
    except:
        raise InvenioBibDocFileError, "Fullpath %s doesn't correspond to a valid bibdocfile fullpath" % fullpath

def decompose_bibdocfile_url(url):
    """Given a bibdocfile_url return a triple (recid, docname, format)."""
    if url.startswith('%s/getfile.py' % CFG_SITE_URL) or url.startswith('%s/getfile.py' % CFG_SITE_SECURE_URL):
        # Legacy /getfile.py?... URLs have their own decomposer.
        return decompose_bibdocfile_very_old_url(url)
    if url.startswith('%s/%s/' % (CFG_SITE_URL, CFG_SITE_RECORD)):
        recid_file = url[len('%s/%s/' % (CFG_SITE_URL, CFG_SITE_RECORD)):]
    elif url.startswith('%s/%s/' % (CFG_SITE_SECURE_URL, CFG_SITE_RECORD)):
        recid_file = url[len('%s/%s/' % (CFG_SITE_SECURE_URL, CFG_SITE_RECORD)):]
    else:
        raise InvenioBibDocFileError, "Url %s doesn't correspond to a valid record inside the system." % url
    recid_file = recid_file.replace('/files/', '/')
    recid, docname, docformat = decompose_file(urllib.unquote(recid_file)) # this will work in the case of URL... not file !
    if not recid and docname.isdigit():
        ## If the URL was something similar to CFG_SITE_URL/CFG_SITE_RECORD/123
        ## (i.e. no docname at all), the record id ends up in the docname slot.
        return (int(docname), '', '')
    return (int(recid), docname, docformat)

# Matches the record id in old-style file URLs, e.g. .../CFG_SITE_RECORD/123/files/
re_bibdocfile_old_url = re.compile(r'/%s/(\d*)/files/' % CFG_SITE_RECORD)
def decompose_bibdocfile_old_url(url):
    """Given a bibdocfile old url (e.g. CFG_SITE_URL/CFG_SITE_RECORD/123/files)
    it returns the recid."""
    g = re_bibdocfile_old_url.search(url)
    if g:
        return int(g.group(1))
    raise InvenioBibDocFileError('%s is not a valid old bibdocfile url' % url)

def decompose_bibdocfile_very_old_url(url):
    """Decompose an old /getfile.py? URL into (recid, docname, format)."""
    if url.startswith('%s/getfile.py' % CFG_SITE_URL) or url.startswith('%s/getfile.py' % CFG_SITE_SECURE_URL):
        # Everything after '?' in the URL.
        params = urllib.splitquery(url)[1]
        if params:
            try:
                params = cgi.parse_qs(params)
                if 'docid' in params:
                    # docid-style URL: resolve recid/docname through the BibDoc.
                    docid = int(params['docid'][0])
                    bibdoc = BibDoc.create_instance(docid)
                    if bibdoc.bibrec_links:
                        recid = bibdoc.bibrec_links[0]["rec_id"]
                        docname = bibdoc.bibrec_links[0]["doc_name"]
                    else:
                        raise InvenioBibDocFileError("Old style URL pointing to an unattached document")
                elif 'recid' in params:
                    # recid-style URL: docname is optional ('name' parameter).
                    recid = int(params['recid'][0])
                    if 'name' in params:
                        docname = params['name'][0]
                    else:
                        docname = ''
                else:
                    raise InvenioBibDocFileError('%s has not enough params to correspond to a bibdocfile.' % url)
                docformat = normalize_format(params.get('format', [''])[0])
                return (recid, docname, docformat)
            except Exception, e:
                raise InvenioBibDocFileError('Problem with %s: %s' % (url, e))
        else:
            raise InvenioBibDocFileError('%s has no params to correspond to a bibdocfile.' % url)
    else:
        raise InvenioBibDocFileError('%s is not a valid very old bibdocfile url' % url)

def get_docname_from_url(url):
    """Return a potential docname given a url"""
    # The docname is the unquoted basename of the URL path, minus extension.
    path = urllib2.urlparse.urlsplit(urllib.unquote(url))[2]
    filename = os.path.split(path)[-1]
    return file_strip_ext(filename)

def get_format_from_url(url):
    """Return a potential format given a url"""
    # The format is the extension part of the URL path's basename.
    path = urllib2.urlparse.urlsplit(urllib.unquote(url))[2]
    filename = os.path.split(path)[-1]
    return filename[len(file_strip_ext(filename)):]

def clean_url(url):
    """Given a local url e.g. a local path it render it a realpath."""
    if is_url_a_local_file(url):
        path = urllib2.urlparse.urlsplit(urllib.unquote(url))[2]
        return os.path.abspath(path)
    else:
        return url

def is_url_a_local_file(url):
    """Return True if the given URL is pointing to a local file."""
    # A missing scheme or an explicit file:// scheme means a local path.
    protocol = urllib2.urlparse.urlsplit(url)[0]
    return protocol in ('', 'file')

def check_valid_url(url):
    """
    Check for validity of a url or a file.

    @param url: the URL to check
    @type url: string
    @raise StandardError: if the URL is not a valid URL.
    """
    try:
        if is_url_a_local_file(url):
            # Local path: must already be normalized (defends against '..' tricks)
            # and must live under one of the explicitly allowed directories.
            path = urllib2.urlparse.urlsplit(urllib.unquote(url))[2]
            if os.path.abspath(path) != path:
                raise StandardError, "%s is not a normalized path (would be %s)." % (path, os.path.normpath(path))
            for allowed_path in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS + [CFG_TMPDIR, CFG_TMPSHAREDDIR, CFG_WEBSUBMIT_STORAGEDIR]:
                if path.startswith(allowed_path):
                    # Probe readability by opening and closing the file.
                    dummy_fd = open(path)
                    dummy_fd.close()
                    return
            raise StandardError, "%s is not in one of the allowed paths." % path
        else:
            # Remote URL: consider it valid if it can be opened at all.
            try:
                open_url(url)
            except InvenioBibdocfileUnauthorizedURL, e:
                raise StandardError, str(e)
    except Exception, e:
        raise StandardError, "%s is not a correct url: %s" % (url, e)

def safe_mkstemp(suffix, prefix='bibdocfile_'):
    """Create a temporary filename that don't have any '.'
    inside a part from the suffix."""
    tmpfd, tmppath = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=CFG_TMPDIR)
    # Close the file and leave the responsibility to the client code to
    # correctly open/close it.
    os.close(tmpfd)
    if '.' not in suffix:
        # Just in case format is empty
        return tmppath
    # Keep re-creating the temporary file until the random part of the name
    # (everything before the suffix) contains no '.' character.
    while '.' in os.path.basename(tmppath)[:-len(suffix)]:
        os.remove(tmppath)
        tmpfd, tmppath = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=CFG_TMPDIR)
        os.close(tmpfd)
    return tmppath

def download_local_file(filename, docformat=None):
    """
    Copies a local file to Invenio's temporary directory.

    @param filename: the name of the file to copy
    @type filename: string
    @param docformat: the format of the file to copy (will be found if not
            specified)
    @type docformat: string
    @return: the path of the temporary file created
    @rtype: string
    @raise StandardError: if something went wrong
    """
    # Make sure the format is OK.
    if docformat is None:
        docformat = guess_format_from_url(filename)
    else:
        docformat = normalize_format(docformat)
    tmppath = ''
    # Now try to copy.
    try:
        path = urllib2.urlparse.urlsplit(urllib.unquote(filename))[2]
        # Refuse non-normalized paths (e.g. containing '..').
        if os.path.abspath(path) != path:
            raise StandardError, "%s is not a normalized path (would be %s)." \
                    % (path, os.path.normpath(path))
        # NOTE(review): unlike check_valid_url(), this allowed list does not
        # include CFG_TMPSHAREDDIR -- confirm whether that is intentional.
        for allowed_path in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS + [CFG_TMPDIR, CFG_WEBSUBMIT_STORAGEDIR]:
            if path.startswith(allowed_path):
                tmppath = safe_mkstemp(docformat)
                shutil.copy(path, tmppath)
                if os.path.getsize(tmppath) == 0:
                    os.remove(tmppath)
                    raise StandardError, "%s seems to be empty" % filename
                break
        else:
            # The for-loop fell through: the path matched no allowed prefix.
            raise StandardError, "%s is not in one of the allowed paths." % path
    except Exception, e:
        raise StandardError, "Impossible to copy the local file '%s': %s" % \
                (filename, str(e))
    return tmppath

def download_external_url(url, docformat=None):
    """
    Download a url (if it corresponds to a remote file) and return a
    local url to it.
    @param url: the URL to download
    @type url: string
    @param docformat: the format of the file (will be found if not specified)
    @type docformat: string
    @return: the path to the download local file
    @rtype: string
    @raise StandardError: if the download failed
    """
    tmppath = None
    # Make sure the format is OK.
    if docformat is None:
        # First try to find a known extension to the URL
        docformat = decompose_file(url, skip_version=True,
                only_known_extensions=True)[2]
        if not docformat:
            # No correct format could be found. Will try to get it from the
            # HTTP message headers.
            docformat = ''
    else:
        docformat = normalize_format(docformat)
    from_file, to_file, tmppath = None, None, ''
    try:
        from_file = open_url(url)
    except InvenioBibdocfileUnauthorizedURL, e:
        raise StandardError, str(e)
    except urllib2.URLError, e:
        raise StandardError, 'URL could not be opened: %s' % str(e)
    if not docformat:
        # We could not determine the format from the URL, so let's try
        # to read it from the HTTP headers.
        docformat = get_format_from_http_response(from_file)
    try:
        tmppath = safe_mkstemp(docformat)
        # Stream the remote content into the temporary file in fixed-size blocks.
        to_file = open(tmppath, 'w')
        while True:
            block = from_file.read(CFG_BIBDOCFILE_BLOCK_SIZE)
            if not block:
                break
            to_file.write(block)
        to_file.close()
        from_file.close()
        if os.path.getsize(tmppath) == 0:
            raise StandardError, "%s seems to be empty" % url
    except Exception, e:
        # Try to close and remove the temporary file.
        try:
            to_file.close()
        except Exception:
            pass
        try:
            os.remove(tmppath)
        except Exception:
            pass
        raise StandardError, "Error when downloading %s into %s: %s" % \
                (url, tmppath, e)
    return tmppath

def get_format_from_http_response(response):
    """
    Tries to retrieve the format of the file from the message headers of the
    HTTP response.
@param response: the HTTP response @type response: file-like object (as returned by urllib.urlopen) @return: the format of the remote resource @rtype: string """ def parse_content_type(text): return text.split(';')[0].strip() def parse_content_disposition(text): for item in text.split(';'): item = item.strip() if item.strip().startswith('filename='): return item[len('filename="'):-len('"')] info = response.info() docformat = '' content_disposition = info.getheader('Content-Disposition') if content_disposition: filename = parse_content_disposition(content_disposition) if filename: docformat = decompose_file(filename, only_known_extensions=False)[2] if docformat: return docformat content_type = info.getheader('Content-Type') if content_type: content_type = parse_content_type(content_type) if content_type not in ('text/plain', 'application/octet-stream'): ## We actually ignore these mimetypes since they are the ## defaults often returned by Apache in case the mimetype ## was not known ext = _mimes.guess_extension(content_type) if ext: ## Normalize some common magic mis-interpreation ext = {'.asc': '.txt', '.obj': '.bin'}.get(ext, ext) docformat = normalize_format(ext) return docformat def download_url(url, docformat=None): """ Download a url (if it corresponds to a remote file) and return a local url to it. """ tmppath = None try: if is_url_a_local_file(url): tmppath = download_local_file(url, docformat = docformat) else: tmppath = download_external_url(url, docformat = docformat) except StandardError: raise return tmppath class MoreInfo(object): """This class represents a genering MoreInfo dictionary. MoreInfo object can be attached to bibdoc, bibversion, format or BibRelation. The entity where a particular MoreInfo object is attached has to be specified using the constructor parametes. This class is a thin wrapper around the database table. 
""" def __init__(self, docid = None, version = None, docformat = None, relation = None, cache_only = False, cache_reads = True, initial_data = None): """ @param cache_only Determines if MoreInfo object should be created in memory only or reflected in the database @type cache_only boolean @param cache_reads Determines if reads should be executed on the in-memory cache or should be redirected to the database. If this is true, cache can be entirely regenerated from the database only upon an explicit request. If the value is not present in the cache, the database is queried @type cache_reads boolean @param initial_data Allows to specify initial content of the cache. This parameter is useful when we create an in-memory instance from serialised value @type initial_data string """ self.docid = docid self.version = version self.format = docformat self.relation = relation self.cache_only = cache_only if initial_data != None: self.cache = initial_data self.dirty = initial_data if not self.cache_only: self._flush_cache() #inserts new entries else: self.cache = {} self.dirty = {} self.cache_reads = cache_reads if not self.cache_only: self.populate_from_database() @staticmethod def create_from_serialised(ser_str, docid = None, version = None, docformat = None, relation = None, cache_only = False, cache_reads = True): """Creates an instance of MoreInfo using serialised data as the cache content""" data = cPickle.loads(base64.b64decode(ser_str)) return MoreInfo(docid = docid, version = version, docformat = docformat, relation = relation, cache_only = cache_only, cache_reads = cache_reads, initial_data = data); def serialise_cache(self): """Returns a serialised representation of the cache""" return base64.b64encode(cPickle.dumps(self.get_cache())) def populate_from_database(self): """Retrieves all values of MoreInfo and places them in the cache""" where_str, where_args = self._generate_where_query_args() query_str = "SELECT namespace, data_key, data_value FROM bibdocmoreinfo WHERE 
%s" % (where_str, ) res = run_sql(query_str, where_args) if res: for row in res: namespace, data_key, data_value_ser = row data_value = cPickle.loads(data_value_ser) if not namespace in self.cache: self.cache[namespace] = {} self.cache[namespace][data_key] = data_value def _mark_dirty(self, namespace, data_key): """Marks a data key dirty - that should be saved into the database""" if not namespace in self.dirty: self.dirty[namespace] = {} self.dirty[namespace][data_key] = True def _database_get_distinct_string_list(self, column, namespace = None): """A private method reading an unique list of strings from the moreinfo database table""" where_str, where_args = self._generate_where_query_args( namespace = namespace) query_str = "SELECT DISTINCT %s FROM bibdocmoreinfo WHERE %s" % \ ( column, where_str, ) if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(where_args)) print "Executing query: " + query_str + " ARGS: " + repr(where_args) res = run_sql(query_str, where_args) return (res and [x[0] for x in res]) or [] # after migrating to python 2.6, can be rewritten using x if y else z syntax: return [x[0] for x in res] if res else [] def _database_get_namespaces(self): """Read the database to discover namespaces declared in a given MoreInfo""" return self._database_get_distinct_string_list("namespace") def _database_get_keys(self, namespace): """Returns all keys assigned in a given namespace of a MoreInfo instance""" return self._database_get_distinct_string_list("data_key", namespace=namespace) def _database_contains_key(self, namespace, key): return self._database_read_value(namespace, key) != None def _database_save_value(self, namespace, key, value): """Write changes into the database""" #TODO: this should happen within one transaction serialised_val = cPickle.dumps(value) # on duplicate key will not work here as miltiple null values are permitted by the index if not 
self._database_contains_key(namespace, key):
            #insert new value
            query_parts = []
            query_args = []
            # Columns to write; _val_or_null() appends either %s-with-arg or
            # a literal NULL placeholder for each value.
            to_process = [(self.docid, "id_bibdoc"), (self.version, "version"),
                          (self.format, "format"), (self.relation, "id_rel"),
                          (str(namespace), "namespace"), (str(key), "data_key"),
                          (str(serialised_val), "data_value")]
            for entry in to_process:
                _val_or_null(entry[0], q_str = query_parts, q_args = query_args)
            columns_str = ", ".join(map(lambda x: x[1], to_process))
            values_str = ", ".join(query_parts)
            query_str = "INSERT INTO bibdocmoreinfo (%s) VALUES(%s)" % \
                (columns_str, values_str)
            if DBG_LOG_QUERIES:
                from invenio.bibtask import write_message
                write_message("Executing query: " + query_str + " ARGS: " + repr(query_args))
                print "Executing query: " + query_str + " ARGS: " + repr(query_args)
            run_sql(query_str, query_args)
        else:
            #Update existing value
            where_str, where_args = self._generate_where_query_args(namespace, key)
            query_str = "UPDATE bibdocmoreinfo SET data_value=%s WHERE " + where_str
            query_args = [str(serialised_val)] + where_args
            if DBG_LOG_QUERIES:
                from invenio.bibtask import write_message
                write_message("Executing query: " + query_str + " ARGS: " + repr(query_args))
                print "Executing query: " + query_str + " ARGS: " + repr(query_args)
            run_sql(query_str, query_args )

    def _database_read_value(self, namespace, key):
        """Reads a value directly from the database
        @param namespace - namespace of the data to be read
        @param key - key of the data to be read
        """
        where_str, where_args = self._generate_where_query_args(namespace = namespace, data_key = key)
        query_str = "SELECT data_value FROM bibdocmoreinfo WHERE " + where_str
        res = run_sql(query_str, where_args)
        if DBG_LOG_QUERIES:
            from invenio.bibtask import write_message
            write_message("Executing query: " + query_str + " ARGS: " + repr(where_args) + "WITH THE RESULT: " + str(res))
            s_ = ""
            if res:
                s_ = cPickle.loads(res[0][0])
            print "Executing query: " + query_str + " ARGS: " + repr(where_args) + " WITH THE RESULT: " + str(s_)
        # Deserialise the stored pickle; an empty result means "no value".
        if res
and res[0][0]:
            try:
                return cPickle.loads(res[0][0])
            except:
                raise Exception("Error when deserialising value for %s key=%s retrieved value=%s" % (repr(self), str(key), str(res[0][0])))
        return None

    def _database_remove_value(self, namespace, key):
        """Removes an entry directly in the database"""
        where_str, where_args = self._generate_where_query_args(namespace = namespace, data_key = key)
        query_str = "DELETE FROM bibdocmoreinfo WHERE " + where_str
        if DBG_LOG_QUERIES:
            from invenio.bibtask import write_message
            write_message("Executing query: " + query_str + " ARGS: " + repr(where_args))
            print "Executing query: " + query_str + " ARGS: " + repr(where_args)
        run_sql(query_str, where_args)
        return None

    def _flush_cache(self):
        """Writes all the dirty cache entries into the database"""
        for namespace in self.dirty:
            for data_key in self.dirty[namespace]:
                if namespace in self.cache and data_key in self.cache[namespace]\
                    and not self.cache[namespace][data_key] is None:
                    self._database_save_value(namespace, data_key, self.cache[namespace][data_key])
                else:
                    # This might happen if a value has been removed from the cache
                    self._database_remove_value(namespace, data_key)
        self.dirty = {}

    def _generate_where_query_args(self, namespace = None, data_key = None):
        """Private method generating WHERE clause of SQL statements"""
        # Optional namespace/data_key filters are appended to the identity
        # columns (docid, version, format, relation) of this MoreInfo.
        ns = []
        if namespace != None:
            ns = [(namespace, "namespace")]
        dk = []
        if data_key != None:
            dk = [(data_key, "data_key")]
        to_process = [(self.docid, "id_bibdoc"), (self.version, "version"),
                      (self.format, "format"), (self.relation, "id_rel")] + \
                      ns + dk
        return _sql_generate_conjunctive_where(to_process)

    def set_data(self, namespace, key, value):
        """setting data directly in the database dictionary"""
        if not namespace in self.cache:
            self.cache[namespace] = {}
        self.cache[namespace][key] = value
        self._mark_dirty(namespace, key)
        if not self.cache_only:
            # Write-through: persist immediately unless memory-only.
            self._flush_cache()

    def get_data(self, namespace, key):
        """retrieving data from the database"""
        if self.cache_reads or
self.cache_only:
            if namespace in self.cache and key in self.cache[namespace]:
                return self.cache[namespace][key]
        if not self.cache_only:
            # we have a permission to read from the database
            value = self._database_read_value(namespace, key)
            # NOTE(review): falsy values (0, '', False) are returned but not
            # cached because of this truthiness test -- confirm intentional.
            if value:
                if not namespace in self.cache:
                    self.cache[namespace] = {}
                self.cache[namespace][key] = value
            return value
        return None

    def del_key(self, namespace, key):
        """Removes a key from the cache and, unless the instance is
        memory-only, from the database as well."""
        if not namespace in self.cache:
            return None
        del self.cache[namespace][key]
        self._mark_dirty(namespace, key)
        if not self.cache_only:
            self._flush_cache()

    def contains_key(self, namespace, key):
        # A key is considered present when its value is not None.
        return self.get_data(namespace, key) != None

    # the dictionary interface -> updating the default namespace
    def __setitem__(self, key, value):
        self.set_data("", key, value) #the default value

    def __getitem__(self, key):
        return self.get_data("", key)

    def __delitem__(self, key):
        self.del_key("", key)

    def __contains__(self, key):
        return self.contains_key("", key)

    def __repr__(self):
        return "MoreInfo(docid=%s, version=%s, docformat=%s, relation=%s)" % \
            (self.docid, self.version, self.format, self.relation)

    def delete(self):
        """Remove all entries associated with this MoreInfo"""
        self.cache = {}
        if not self.cache_only:
            where_str, query_args = self._generate_where_query_args()
            query_str = "DELETE FROM bibdocmoreinfo WHERE %s" % (where_str, )
            if DBG_LOG_QUERIES:
                from invenio.bibtask import write_message
                write_message("Executing query: " + query_str + " ARGS: " + repr(query_args))
                print "Executing query: " + query_str + " ARGS: " + repr(query_args)
            run_sql(query_str, query_args)

    def get_cache(self):
        """Returns the content of the cache
        @return The content of the MoreInfo cache
        @rtype dictionary {namespace: {key1: value1, ... }, namespace2: {}}
        """
        return self.cache

    def get_namespaces(self):
        """Returns a list of namespaces present in the MoreInfo structure.
           If the object is permitted access to the database, the data should be
           always read from there.
           Unlike when reading a particular value, we can not check if
           value is missing in the cache
        """
        if self.cache_only and self.cache_reads:
            return self.cache.keys()
        return self._database_get_namespaces()

    def get_keys(self, namespace):
        """Returns a list of keys present in a given namespace"""
        if self.cache_only and self.cache_reads:
            res = []
            if namespace in self.cache:
                res = self.cache[namespace].keys()
            return res
        else:
            return self._database_get_keys(namespace)

    def flush(self):
        """Flush the content into the database"""
        self._flush_cache()

class BibDocMoreInfo(MoreInfo):
    """
    This class wraps contextual information of the documents, such as the
        - comments
        - descriptions
        - flags.
    Such information is kept separately per every format/version instance of
    the corresponding document and is searialized in the database, ready
    to be retrieved (but not searched).
    @param docid: the document identifier.
    @type docid: integer
    @param more_info: a serialized version of an already existing more_info
        object. If not specified this information will be readed from the
        database, and othewise an empty dictionary will be allocated.
    @raise ValueError: if docid is not a positive integer.
    @ivar docid: the document identifier as passed to the constructor.
    @type docid: integer
    @ivar more_info: the more_info dictionary that will hold all the
        additional document information.
    @type more_info: dict of dict of dict
    @note: in general this class is never instanciated in client code and
        never used outside bibdocfile module.
    @note: this class will be extended in the future to hold all the new
        auxiliary information about a document.
    """
    def __init__(self, docid, cache_only = False, initial_data = None):
        if not (type(docid) in (long, int) and docid > 0):
            raise ValueError("docid is not a positive integer, but %s."
% docid)
        MoreInfo.__init__(self, docid, cache_only = cache_only, initial_data = initial_data)
        # Guarantee the three well-known namespless keys always exist.
        if 'descriptions' not in self:
            self['descriptions'] = {}
        if 'comments' not in self:
            self['comments'] = {}
        if 'flags' not in self:
            self['flags'] = {}
        if DBG_LOG_QUERIES:
            from invenio.bibtask import write_message
            write_message("Creating BibDocMoreInfo :" + repr(self["comments"]))
            print "Creating BibdocMoreInfo :" + repr(self["comments"])

    def __repr__(self):
        """
        @return: the canonical string representation of the C{BibDocMoreInfo}.
        @rtype: string
        """
        return 'BibDocMoreInfo(%i, %s)' % (self.docid, repr(cPickle.dumps(self)))

    def set_flag(self, flagname, docformat, version):
        """
        Sets a flag.

        @param flagname: the flag to set (see
            L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}).
        @type flagname: string
        @param docformat: the format for which the flag should set.
        @type docformat: string
        @param version: the version for which the flag should set:
        @type version: integer
        @raise ValueError: if the flag is not in
            L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}
        """
        if flagname in CFG_BIBDOCFILE_AVAILABLE_FLAGS:
            # Flags are stored as flags[flagname][version][docformat] = True;
            # re-assigning self['flags'] marks the entry dirty for flushing.
            flags = self['flags']
            if not flagname in flags:
                flags[flagname] = {}
            if not version in flags[flagname]:
                flags[flagname][version] = {}
            if not docformat in flags[flagname][version]:
                flags[flagname][version][docformat] = {}
            flags[flagname][version][docformat] = True
            self['flags'] = flags
        else:
            raise ValueError, "%s is not in %s" % \
                (flagname, CFG_BIBDOCFILE_AVAILABLE_FLAGS)

    def get_comment(self, docformat, version):
        """
        Returns the specified comment.

        @param docformat: the format for which the comment should be
            retrieved.
        @type docformat: string
        @param version: the version for which the comment should be
            retrieved.
        @type version: integer
        @return: the specified comment.
        @rtype: string
        """
        try:
            assert(type(version) is int)
            docformat = normalize_format(docformat)
            # Missing version/format yields None rather than raising.
            return self['comments'].get(version, {}).get(docformat)
        except:
            register_exception()
            raise

    def get_description(self, docformat, version):
        """
        Returns the specified description.

        @param docformat: the format for which the description should be
            retrieved.
        @type docformat: string
        @param version: the version for which the description should be
            retrieved.
        @type version: integer
        @return: the specified description.
        @rtype: string
        """
        try:
            assert(type(version) is int)
            docformat = normalize_format(docformat)
            return self['descriptions'].get(version, {}).get(docformat)
        except:
            register_exception()
            raise

    def has_flag(self, flagname, docformat, version):
        """
        Return True if the corresponding has been set.

        @param flagname: the name of the flag (see
            L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}).
        @type flagname: string
        @param docformat: the format for which the flag should be checked.
        @type docformat: string
        @param version: the version for which the flag should be checked.
        @type version: integer
        @return: True if the flag is set for the given format/version.
        @rtype: bool
        @raise ValueError: if the flagname is not in
            L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}
        """
        if flagname in CFG_BIBDOCFILE_AVAILABLE_FLAGS:
            return self['flags'].get(flagname, {}).get(version, {}).get(docformat, False)
        else:
            raise ValueError, "%s is not in %s" % (flagname, CFG_BIBDOCFILE_AVAILABLE_FLAGS)

    def get_flags(self, docformat, version):
        """
        Return the list of all the enabled flags.

        @param docformat: the format for which the list should be returned.
        @type docformat: string
        @param version: the version for which the list should be returned.
        @type version: integer
        @return: the list of enabled flags (from
            L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}).
        @rtype: list of string
        """
        return [flag for flag in self['flags'] if docformat in self['flags'][flag].get(version, {})]

    def set_comment(self, comment, docformat, version):
        """
        Set a comment.
        @param comment: the comment to be set.
        @type comment: string
        @param docformat: the format for which the comment should be set.
        @type docformat: string
        @param version: the version for which the comment should be set:
        @type version: integer
        """
        try:
            assert(type(version) is int and version > 0)
            docformat = normalize_format(docformat)
            if comment == KEEP_OLD_VALUE:
                # Keep the existing comment, falling back to the previous
                # version's comment if this version has none.
                comment = self.get_comment(docformat, version) or self.get_comment(docformat, version - 1)
            if not comment:
                # Empty comment means "remove any existing comment".
                self.unset_comment(docformat, version)
                return
            if not version in self['comments']:
                comments = self['comments']
                comments[version] = {}
                self['comments'] = comments
            comments = self['comments']
            comments[version][docformat] = comment
            # Re-assigning marks the 'comments' entry dirty so it is persisted.
            self['comments'] = comments
        except:
            register_exception()
            raise

    def set_description(self, description, docformat, version):
        """
        Set a description.

        @param description: the description to be set.
        @type description: string
        @param docformat: the format for which the description should be set.
        @type docformat: string
        @param version: the version for which the description should be set:
        @type version: integer
        """
        try:
            assert(type(version) is int and version > 0)
            docformat = normalize_format(docformat)
            if description == KEEP_OLD_VALUE:
                # Same fall-back behaviour as set_comment().
                description = self.get_description(docformat, version) or self.get_description(docformat, version - 1)
            if not description:
                self.unset_description(docformat, version)
                return

            descriptions = self['descriptions']
            if not version in descriptions:
                descriptions[version] = {}
            descriptions[version][docformat] = description
            self.set_data("", 'descriptions', descriptions)
        except:
            register_exception()
            raise

    def unset_comment(self, docformat, version):
        """
        Unset a comment.

        @param docformat: the format for which the comment should be unset.
        @type docformat: string
        @param version: the version for which the comment should be unset:
        @type version: integer
        """
        try:
            assert(type(version) is int and version > 0)
            comments = self['comments']
            del comments[version][docformat]
            self['comments'] = comments
        except KeyError:
            # No such comment: nothing to remove.
            pass
        except:
            register_exception()
            raise

    def unset_description(self, docformat, version):
        """
        Unset a description.

        @param docformat: the format for which the description should be unset.
        @type docformat: string
        @param version: the version for which the description should be unset:
        @type version: integer
        """
        try:
            assert(type(version) is int and version > 0)
            descriptions = self['descriptions']
            del descriptions[version][docformat]
            self['descriptions'] = descriptions
        except KeyError:
            # No such description: nothing to remove.
            pass
        except:
            register_exception()
            raise

    def unset_flag(self, flagname, docformat, version):
        """
        Unset a flag.

        @param flagname: the flag to be unset (see
            L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}).
        @type flagname: string
        @param docformat: the format for which the flag should be unset.
        @type docformat: string
        @param version: the version for which the flag should be unset:
        @type version: integer
        @raise ValueError: if the flag is not in
            L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}
        """
        if flagname in CFG_BIBDOCFILE_AVAILABLE_FLAGS:
            try:
                flags = self['flags']
                del flags[flagname][version][docformat]
                self['flags'] = flags
            except KeyError:
                # Flag was not set: nothing to remove.
                pass
        else:
            raise ValueError, "%s is not in %s" % (flagname, CFG_BIBDOCFILE_AVAILABLE_FLAGS)

# Sentinel meaning "match any value" in relation look-ups.
_bib_relation__any_value = -1

class BibRelation(object):
    """
    A representation of a relation between documents or their particular versions
    """
    def __init__(self, rel_type = None,
                 bibdoc1_id = None, bibdoc2_id = None,
                 bibdoc1_ver = None, bibdoc2_ver = None,
                 bibdoc1_fmt = None, bibdoc2_fmt = None,
                 rel_id = None):
        """
        The constructor of the class representing a relation between two
        documents.
        If the more_info parameter is specified, no data is retrieved from
        the database and the internal dictionary is initialised with
        the passed value.
If the more_info is not provided, the value is read from the database. In the case of non-existing record, an empty dictionary is assigned. If a version of whichever record is not specified, the resulting object describes a relation of all versions of a given BibDoc. @param bibdoc1 @type bibdoc1 BibDoc @param bibdoc1_ver @type version1_ver int @param bibdoc2 @type bibdoc2 BibDoc @param bibdoc2_ver @type bibdoc2_ver int @param bibdoc1_fmt format of the first document @type bibdoc1_fmt string @param bibdoc2_fmt format of the second document @type bibdoc2_fmt string @param rel_type @type rel_type string @param more_info The serialised representation of the more_info @type more_info string @param rel_id allows to specify the identifier of the newly created relation @type rel_id unsigned int """ self.id = rel_id self.bibdoc1_id = bibdoc1_id self.bibdoc2_id = bibdoc2_id self.bibdoc1_ver = bibdoc1_ver self.bibdoc2_ver = bibdoc2_ver self.bibdoc1_fmt = bibdoc1_fmt self.bibdoc2_fmt = bibdoc2_fmt self.rel_type = rel_type if rel_id == None: self._fill_id_from_data() else: self._fill_data_from_id() self.more_info = MoreInfo(relation = self.id) def _fill_data_from_id(self): """Fill all the relation data from the relation identifier """ query = "SELECT id_bibdoc1, version1, format1, id_bibdoc2, version2, format2, rel_type FROM bibdoc_bibdoc WHERE id=%s" res = run_sql(query, (str(self.id), )) if res != None and res[0] != None: self.bibdoc1_id = res[0][0] self.bibdoc1_ver = res[0][1] self.bibdoc1_fmt = res[0][2] self.bibdoc2_id = res[0][3] self.bibdoc2_ver = res[0][4] self.bibdoc2_fmt = res[0][5] self.rel_type = res[0][6] def _fill_id_from_data(self): """Fill the relation identifier based on the data provided""" where_str, where_args = self._get_where_clauses() query = "SELECT id FROM bibdoc_bibdoc WHERE %s" % (where_str, ) res = run_sql(query, where_args) if res and res[0][0]: self.id = int(res[0][0]) def _get_value_column_mapping(self): """ Returns a list of tuples each tuple 
consists of a value and a name of a database column where this value should fit """ return [(self.rel_type, "rel_type"), (self.bibdoc1_id, "id_bibdoc1"), (self.bibdoc1_ver, "version1"), (self.bibdoc1_fmt, "format1"), (self.bibdoc2_id, "id_bibdoc2"), (self.bibdoc2_ver, "version2"), (self.bibdoc2_fmt, "format2")] def _get_where_clauses(self): """Private function returning part of the SQL statement identifying current relation @return @rtype tuple """ return _sql_generate_conjunctive_where(self._get_value_column_mapping()) @staticmethod def create(bibdoc1_id = None, bibdoc1_ver = None, bibdoc1_fmt = None, bibdoc2_id = None, bibdoc2_ver = None, bibdoc2_fmt = None, rel_type = ""): """ Create a relation and return instance. Omitting an argument means that a particular relation concerns any value of the parameter """ # check if there is already an entry corresponding to parameters existing = BibRelation.get_relations(rel_type = rel_type, bibdoc1_id = bibdoc1_id, bibdoc2_id = bibdoc2_id, bibdoc1_ver = bibdoc1_ver, bibdoc2_ver = bibdoc2_ver, bibdoc1_fmt = bibdoc1_fmt, bibdoc2_fmt = bibdoc2_fmt) if len(existing) > 0: return existing[0] # build the insert query and execute it to_process = [(rel_type, "rel_type"), (bibdoc1_id, "id_bibdoc1"), (bibdoc1_ver, "version1"), (bibdoc1_fmt, "format1"), (bibdoc2_id, "id_bibdoc2"), (bibdoc2_ver, "version2"), (bibdoc2_fmt, "format2")] values_list = [] args_list = [] columns_list = [] for entry in to_process: columns_list.append(entry[1]) if entry[0] == None: values_list.append("NULL") else: values_list.append("%s") args_list.append(entry[0]) query = "INSERT INTO bibdoc_bibdoc (%s) VALUES (%s)" % (", ".join(columns_list), ", ".join(values_list)) # print "Query: %s Args: %s" % (query, str(args_list)) rel_id = run_sql(query, args_list) return BibRelation(rel_id = rel_id) def delete(self): """ Removes a relation between objects from the database. 
executing the flush function on the same object will restore the relation """ where_str, where_args = self._get_where_clauses() run_sql("DELETE FROM bibdoc_bibdoc WHERE %s" % (where_str,), where_args) # kwalitee: disable=sql # removing associated MoreInfo self.more_info.delete() def get_more_info(self): return self.more_info @staticmethod def get_relations(rel_type = _bib_relation__any_value, bibdoc1_id = _bib_relation__any_value, bibdoc2_id = _bib_relation__any_value, bibdoc1_ver = _bib_relation__any_value, bibdoc2_ver = _bib_relation__any_value, bibdoc1_fmt = _bib_relation__any_value, bibdoc2_fmt = _bib_relation__any_value): """Retrieves list of relations satisfying conditions. If a parameter is specified, its value has to match exactly. If a parameter is omitted, any of its values will be accepted""" to_process = [(rel_type, "rel_type"), (bibdoc1_id, "id_bibdoc1"), (bibdoc1_ver, "version1"), (bibdoc1_fmt, "format1"), (bibdoc2_id, "id_bibdoc2"), (bibdoc2_ver, "version2"), (bibdoc2_fmt, "format2")] where_str, where_args = _sql_generate_conjunctive_where( filter(lambda x: x[0] != _bib_relation__any_value, to_process)) if where_str: where_str = "WHERE " + where_str # in case of nonempty where, we need a where clause query_str = "SELECT id FROM bibdoc_bibdoc %s" % (where_str, ) # print "running query : %s with arguments %s on the object %s" % (query_str, str(where_args), repr(self)) try: res = run_sql(query_str, where_args) except: raise Exception(query_str + " " + str(where_args)) results = [] if res != None: for res_row in res: results.append(BibRelation(rel_id=res_row[0])) return results # Access to MoreInfo def set_data(self, category, key, value): """assign additional information to this relation""" self.more_info.set_data(category, key, value) def get_data(self, category, key): """read additional information assigned to this relation""" return self.more_info.get_data(category, key) #the dictionary interface allowing to set data bypassing the namespaces def 
__setitem__(self, key, value): self.more_info[key] = value def __getitem__(self, key): return self.more_info[key] def __contains__(self, key): return self.more_info.__contains__(key) def __repr__(self): return "BibRelation(id_bibdoc1 = %s, version1 = %s, format1 = %s, id_bibdoc2 = %s, version2 = %s, format2 = %s, rel_type = %s)" % \ (self.bibdoc1_id, self.bibdoc1_ver, self.bibdoc1_fmt, self.bibdoc2_id, self.bibdoc2_ver, self.bibdoc2_fmt, self.rel_type) def readfile(filename): """ Read a file. @param filename: the name of the file to be read. @type filename: string @return: the text contained in the file. @rtype: string @note: Returns empty string in case of any error. @note: this function is useful for quick implementation of websubmit functions. """ try: return open(filename).read() except Exception: return '' class HeadRequest(urllib2.Request): """ A request object to perform a HEAD request. """ def get_method(self): return 'HEAD' def read_cookie(cookiefile): """ Parses a cookie file and returns a string as needed for the urllib2 headers The file should respect the Netscape cookie specifications """ cookie_data = '' cfile = open(cookiefile, 'r') for line in cfile.readlines(): tokens = line.split('\t') if len(tokens) == 7: # we are on a cookie line cookie_data += '%s=%s; ' % (tokens[5], tokens[6].replace('\n', '')) cfile.close() return cookie_data def open_url(url, headers=None, head_request=False): """ Opens a URL. If headers are passed as argument, no check is performed and the URL will be opened. Otherwise checks if the URL is present in CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS and uses the headers specified in the config variable. @param url: the URL to open @type url: string @param headers: the headers to use @type headers: dictionary @param head_request: if True, perform a HEAD request, otherwise a POST request @type head_request: boolean @return: a file-like object as returned by urllib2.urlopen. 
""" headers_to_use = None if headers is None: for regex, headers in _CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS: if regex.match(url) is not None: headers_to_use = headers break if headers_to_use is None: # URL is not allowed. raise InvenioBibdocfileUnauthorizedURL, "%s is not an authorized " \ "external URL." % url else: headers_to_use = headers request_obj = head_request and HeadRequest or urllib2.Request request = request_obj(url) request.add_header('User-Agent', make_user_agent_string('bibdocfile')) for key, value in headers_to_use.items(): try: value = globals()[value['fnc']](**value['args']) except (KeyError, TypeError): pass request.add_header(key, value) return urllib2.urlopen(request) def update_modification_date_of_file(filepath, modification_date): """Update the modification time and date of the file with the modification_date @param filepath: the full path of the file that needs to be updated @type filepath: string @param modification_date: the new modification date and time @type modification_date: datetime.datetime object """ try: modif_date_in_seconds = time.mktime(modification_date.timetuple()) # try to get the time in seconds except (AttributeError, TypeError): modif_date_in_seconds = 0 if modif_date_in_seconds: statinfo = os.stat(filepath) # we need to keep the same access time os.utime(filepath, (statinfo.st_atime, modif_date_in_seconds)) #update the modification time diff --git a/modules/miscutil/lib/inveniocfg.py b/modules/miscutil/lib/inveniocfg.py index 031537d73..d88d9e1af 100644 --- a/modules/miscutil/lib/inveniocfg.py +++ b/modules/miscutil/lib/inveniocfg.py @@ -1,1646 +1,1647 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. 
## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Invenio configuration and administration CLI tool. Usage: inveniocfg [options] General options: -h, --help print this help -V, --version print version number Options to finish your installation: --create-apache-conf create Apache configuration files --create-tables create DB tables for Invenio --load-bibfield-conf load the BibField configuration --load-webstat-conf load the WebStat configuration --drop-tables drop DB tables of Invenio --check-openoffice check for correctly set up of openoffice temporary directory Options to set up and test a demo site: --create-demo-site create demo site --load-demo-records load demo records --remove-demo-records remove demo records, keeping demo site --drop-demo-site drop demo site configurations too --run-unit-tests run unit test suite (needs demo site) --run-regression-tests run regression test suite (needs demo site) --run-web-tests run web tests in a browser (needs demo site, Firefox, Selenium IDE) Options to update config files in situ: --update-all perform all the update options --update-config-py update config.py file from invenio.conf file --update-dbquery-py update dbquery.py with DB credentials from invenio.conf --update-dbexec update dbexec with DB credentials from invenio.conf --update-bibconvert-tpl update bibconvert templates with CFG_SITE_URL from invenio.conf --update-web-tests update web test cases with CFG_SITE_URL from invenio.conf Options to update DB tables: --reset-all perform all the reset options --reset-sitename reset tables to take account of new 
CFG_SITE_NAME* --reset-siteadminemail reset tables to take account of new CFG_SITE_ADMIN_EMAIL --reset-fieldnames reset tables to take account of new I18N names from PO files --reset-recstruct-cache reset record structure cache according to CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE --reset-recjson-cache reset record json cache according to CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE Options to upgrade your installation: --upgrade apply all pending upgrades --upgrade-check run pre-upgrade checks for all pending upgrades --upgrade-show-pending show pending upgrades ready to be applied --upgrade-show-applied show history of applied upgrades --upgrade-create-standard-recipe create a new upgrade recipe (for developers) --upgrade-create-release-recipe create a new release upgrade recipe (for developers) Options to help the work: --list print names and values of all options from conf files --get get value of a given option from conf files --conf-dir path to directory where invenio*.conf files are [optional] --detect-system-details print system details such as Apache/Python/MySQL versions """ __revision__ = "$Id$" from ConfigParser import ConfigParser from optparse import OptionParser, OptionGroup, IndentedHelpFormatter, Option, \ OptionError import os import re import shutil import socket import sys def print_usage(): """Print help.""" print __doc__ def get_version(): """ Get running version of Invenio """ from invenio.config import CFG_VERSION return CFG_VERSION def print_version(): """Print version information.""" print get_version() def convert_conf_option(option_name, option_value): """ Convert conf option into Python config.py line, converting values to ints or strings as appropriate. 
""" ## 1) convert option name to uppercase: option_name = option_name.upper() ## 1a) adjust renamed variables: if option_name in ['CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_DOCTYPES', 'CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_RESTRICTIONS', 'CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_MISC', 'CFG_WEBSUBMIT_FILESYSTEM_BIBDOC_GROUP_LIMIT', 'CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS', 'CFG_WEBSUBMIT_DESIRED_CONVERSIONS']: new_option_name = option_name.replace('WEBSUBMIT', 'BIBDOCFILE') print >> sys.stderr, ("""WARNING: %s has been renamed to %s. Please, update your invenio-local.conf file accordingly.""" % (option_name, new_option_name)) option_name = new_option_name ## 2) convert option value to int or string: if option_name in ['CFG_BIBUPLOAD_REFERENCE_TAG', 'CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG', 'CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG', 'CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG', 'CFG_BIBUPLOAD_STRONG_TAGS', 'CFG_BIBFORMAT_HIDDEN_TAGS',]: # some options are supposed be string even when they look like # numeric option_value = '"' + option_value + '"' else: try: option_value = int(option_value) except ValueError: option_value = '"' + option_value + '"' ## 3a) special cases: chars regexps if option_name in ['CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS', 'CFG_BIBINDEX_CHARS_PUNCTUATION']: option_value = 'r"[' + option_value[1:-1] + ']"' ## 3abis) special cases: real regexps if option_name in ['CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES', 'CFG_BATCHUPLOADER_WEB_ROBOT_AGENTS']: option_value = 'r"' + option_value[1:-1] + '"' ## 3b) special cases: True, False, None if option_value in ['"True"', '"False"', '"None"']: option_value = option_value[1:-1] ## 3c) special cases: dicts and real pythonic lists if option_name in ['CFG_WEBSEARCH_FIELDS_CONVERT', 'CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS', 'CFG_WEBSEARCH_FULLTEXT_SNIPPETS', 'CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS', 'CFG_SITE_EMERGENCY_EMAIL_ADDRESSES', 'CFG_BIBMATCH_FUZZY_WORDLIMITS', 'CFG_BIBMATCH_QUERY_TEMPLATES', 'CFG_WEBSEARCH_SYNONYM_KBRS', 
'CFG_BIBINDEX_SYNONYM_KBRS', 'CFG_WEBCOMMENT_EMAIL_REPLIES_TO', 'CFG_WEBCOMMENT_RESTRICTION_DATAFIELD', 'CFG_WEBCOMMENT_ROUND_DATAFIELD', 'CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS', 'CFG_BIBSCHED_NODE_TASKS', 'CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE', 'CFG_OAI_METADATA_FORMATS', 'CFG_BIBDOCFILE_DESIRED_CONVERSIONS', 'CFG_BIBDOCFILE_BEST_FORMATS_TO_EXTRACT_TEXT_FROM', 'CFG_WEB_API_KEY_ALLOWED_URL', 'CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_MISC', 'CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_DOCTYPES', 'CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_RESTRICTIONS', 'CFG_REFEXTRACT_KBS_OVERRIDE', 'CFG_OPENID_CONFIGURATIONS', 'CFG_OAUTH1_CONFIGURATIONS', - 'CFG_OAUTH2_CONFIGURATIONS',]: + 'CFG_OAUTH2_CONFIGURATIONS', + 'CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES',]: try: option_value = option_value[1:-1] except TypeError: if option_name in ('CFG_WEBSEARCH_FULLTEXT_SNIPPETS',): print >> sys.stderr, """WARNING: CFG_WEBSEARCH_FULLTEXT_SNIPPETS has changed syntax: it can be customised to display different snippets for different document types. See the corresponding documentation in invenio.conf. You may want to customise your invenio-local.conf configuration accordingly.""" option_value = """{'': %s}""" % option_value else: print >> sys.stderr, "ERROR: type error in %s value %s." 
% \ (option_name, option_value) sys.exit(1) ## 3cbis) very special cases: dicts with backward compatible string if option_name in ['CFG_BIBINDEX_SPLASH_PAGES']: if option_value.startswith('"{') and option_value.endswith('}"'): option_value = option_value[1:-1] else: option_value = """{%s: ".*"}""" % option_value ## 3d) special cases: comma-separated lists if option_name in ['CFG_SITE_LANGS', 'CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS', 'CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS', 'CFG_BIBUPLOAD_STRONG_TAGS', 'CFG_BIBFORMAT_HIDDEN_TAGS', 'CFG_BIBSCHED_GC_TASKS_TO_REMOVE', 'CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE', 'CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS', 'CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS', 'CFG_BIBUPLOAD_DELETE_FORMATS', 'CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES', 'CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST', 'CFG_WEBSEARCH_RSS_I18N_COLLECTIONS', 'CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY', 'CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY', 'CFG_BIBCIRCULATION_ITEM_STATUS_OPTIONAL', 'CFG_PLOTEXTRACTOR_DISALLOWED_TEX', 'CFG_OAI_FRIENDS', 'CFG_WEBSTYLE_REVERSE_PROXY_IPS', 'CFG_BIBEDIT_AUTOCOMPLETE_INSTITUTIONS_FIELDS', 'CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS', 'CFG_BIBFORMAT_HIDDEN_FILE_FORMATS', 'CFG_BIBFIELD_MASTER_FORMATS', 'CFG_OPENID_PROVIDERS', 'CFG_OAUTH1_PROVIDERS', 'CFG_OAUTH2_PROVIDERS',]: out = "[" for elem in option_value[1:-1].split(","): if elem: elem = elem.strip() if option_name in ['CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES']: # 3d1) integer values out += "%i, " % int(elem) else: # 3d2) string values out += "'%s', " % elem out += "]" option_value = out ## 3e) special cases: multiline if option_name == 'CFG_OAI_IDENTIFY_DESCRIPTION': # make triple quotes option_value = '""' + option_value + '""' ## 3f) ignore some options: if option_name.startswith('CFG_SITE_NAME_INTL'): # treated elsewhere return ## 3g) special cases: float if option_name in ['CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY', 'CFG_BIBMATCH_LOCAL_SLEEPTIME', 'CFG_BIBMATCH_REMOTE_SLEEPTIME', 
'CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT', 'CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT']: option_value = float(option_value[1:-1]) ## 3h) special cases: bibmatch validation list if option_name in ['CFG_BIBMATCH_MATCH_VALIDATION_RULESETS']: option_value = option_value[1:-1] ## 4a) dropped variables if option_name in ['CFG_BATCHUPLOADER_WEB_ROBOT_AGENT']: print >> sys.stderr, ("""ERROR: CFG_BATCHUPLOADER_WEB_ROBOT_AGENT has been dropped in favour of CFG_BATCHUPLOADER_WEB_ROBOT_AGENTS. Please, update your invenio-local.conf file accordingly.""") option_value = option_value[1:-1] elif option_name in ['CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_DOCTYPES', 'CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_RESTRICTIONS', 'CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_MISC', 'CFG_WEBSUBMIT_FILESYSTEM_BIBDOC_GROUP_LIMIT', 'CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS', 'CFG_WEBSUBMIT_DESIRED_CONVERSIONS']: new_option_name = option_name.replace('WEBSUBMIT', 'BIBDOCFILE') print >> sys.stderr, ("""ERROR: %s has been renamed to %s. Please, update your invenio-local.conf file accordingly.""" % (option_name, new_option_name)) option_name = new_option_name ## 5) finally, return output line: return '%s = %s' % (option_name, option_value) def cli_cmd_update_config_py(conf): """ Update new config.py from conf options, keeping previous config.py in a backup copy. """ print ">>> Going to update config.py..." ## location where config.py is: configpyfile = conf.get("Invenio", "CFG_PYLIBDIR") + \ os.sep + 'invenio' + os.sep + 'config.py' ## backup current config.py file: if os.path.exists(configpyfile): shutil.copy(configpyfile, configpyfile + '.OLD') ## here we go: fdesc = open(configpyfile, 'w') ## generate preamble: fdesc.write("# -*- coding: utf-8 -*-\n") fdesc.write("# DO NOT EDIT THIS FILE! 
IT WAS AUTOMATICALLY GENERATED\n") fdesc.write("# FROM INVENIO.CONF BY EXECUTING:\n") fdesc.write("# " + " ".join(sys.argv) + "\n") ## special treatment for CFG_SITE_NAME_INTL options: fdesc.write("CFG_SITE_NAME_INTL = {}\n") for lang in conf.get("Invenio", "CFG_SITE_LANGS").split(","): fdesc.write("CFG_SITE_NAME_INTL['%s'] = \"%s\"\n" % (lang, conf.get("Invenio", "CFG_SITE_NAME_INTL_" + lang))) ## special treatment for CFG_SITE_SECURE_URL that may be empty, in ## which case it should be put equal to CFG_SITE_URL: if not conf.get("Invenio", "CFG_SITE_SECURE_URL"): conf.set("Invenio", "CFG_SITE_SECURE_URL", conf.get("Invenio", "CFG_SITE_URL")) ## process all the options normally: sections = conf.sections() sections.sort() for section in sections: options = conf.options(section) options.sort() for option in options: if not option.upper().startswith('CFG_DATABASE_'): # put all options except for db credentials into config.py line_out = convert_conf_option(option, conf.get(section, option)) if line_out: fdesc.write(line_out + "\n") ## FIXME: special treatment for experimental variables ## CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES and CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE ## (not offering them in invenio.conf since they will be refactored) fdesc.write("CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE = 0\n") fdesc.write("CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES = [0, 1,]\n") ## generate postamble: fdesc.write("") fdesc.write("# END OF GENERATED FILE") ## we are done: fdesc.close() print "You may want to restart Apache now." print ">>> config.py updated successfully." def cli_cmd_update_dbquery_py(conf): """ Update lib/dbquery.py file with DB parameters read from conf file. Note: this edits dbquery.py in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update dbquery.py..." 
## location where dbquery.py is: dbqueryconfigpyfile = conf.get("Invenio", "CFG_PYLIBDIR") + \ os.sep + 'invenio' + os.sep + 'dbquery_config.py' ## backup current dbquery.py file: if os.path.exists(dbqueryconfigpyfile + 'c'): shutil.copy(dbqueryconfigpyfile + 'c', dbqueryconfigpyfile + 'c.OLD') out = ["%s = '%s'\n" % (item.upper(), value) \ for item, value in conf.items('Invenio') \ if item.upper().startswith('CFG_DATABASE_')] fdesc = open(dbqueryconfigpyfile, 'w') fdesc.write("# -*- coding: utf-8 -*-\n") fdesc.writelines(out) fdesc.close() print "You may want to restart Apache now." print ">>> dbquery.py updated successfully." def cli_cmd_update_dbexec(conf): """ Update bin/dbexec file with DB parameters read from conf file. Note: this edits dbexec in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update dbexec..." ## location where dbexec is: dbexecfile = conf.get("Invenio", "CFG_BINDIR") + \ os.sep + 'dbexec' ## backup current dbexec file: if os.path.exists(dbexecfile): shutil.copy(dbexecfile, dbexecfile + '.OLD') ## replace db parameters via sed: out = '' for line in open(dbexecfile, 'r').readlines(): match = re.search(r'^CFG_DATABASE_(HOST|PORT|NAME|USER|PASS|SLAVE)(\s*=\s*)\'.*\'$', line) if match: dbparam = 'CFG_DATABASE_' + match.group(1) out += "%s%s'%s'\n" % (dbparam, match.group(2), conf.get("Invenio", dbparam)) else: out += line fdesc = open(dbexecfile, 'w') fdesc.write(out) fdesc.close() print ">>> dbexec updated successfully." def cli_cmd_update_bibconvert_tpl(conf): """ Update bibconvert/config/*.tpl files looking for 856 http://.../CFG_SITE_RECORD lines, replacing URL with CFG_SITE_URL taken from conf file. Note: this edits tpl files in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update bibconvert templates..." 
## location where bibconvert/config/*.tpl are: tpldir = conf.get("Invenio", 'CFG_ETCDIR') + \ os.sep + 'bibconvert' + os.sep + 'config' ## find all *.tpl files: for tplfilename in os.listdir(tpldir): if tplfilename.endswith(".tpl"): ## change tpl file: tplfile = tpldir + os.sep + tplfilename shutil.copy(tplfile, tplfile + '.OLD') out = '' for line in open(tplfile, 'r').readlines(): match = re.search(r'^(.*)http://.*?/%s/(.*)$' % conf.get("Invenio", 'CFG_SITE_RECORD'), line) if match: out += "%s%s/%s/%s\n" % (match.group(1), conf.get("Invenio", 'CFG_SITE_URL'), conf.get("Invenio", 'CFG_SITE_RECORD'), match.group(2)) else: out += line fdesc = open(tplfile, 'w') fdesc.write(out) fdesc.close() print ">>> bibconvert templates updated successfully." def cli_cmd_update_web_tests(conf): """ Update web test cases lib/webtest/test_*.html looking for http://.+?[>> Going to update web tests..." ## location where test_*.html files are: testdir = conf.get("Invenio", 'CFG_PREFIX') + os.sep + \ 'lib' + os.sep + 'webtest' + os.sep + 'invenio' ## find all test_*.html files: for testfilename in os.listdir(testdir): if testfilename.startswith("test_") and \ testfilename.endswith(".html"): ## change test file: testfile = testdir + os.sep + testfilename shutil.copy(testfile, testfile + '.OLD') out = '' for line in open(testfile, 'r').readlines(): match = re.search(r'^(.*)http://.+?([)/opt/invenio(.*)$', line) if match: out += "%s%s%s\n" % (match.group(1), conf.get("Invenio", 'CFG_PREFIX'), match.group(2)) else: out += line fdesc = open(testfile, 'w') fdesc.write(out) fdesc.close() print ">>> web tests updated successfully." def cli_cmd_reset_sitename(conf): """ Reset collection-related tables with new CFG_SITE_NAME and CFG_SITE_NAME_INTL* read from conf files. """ print ">>> Going to reset CFG_SITE_NAME and CFG_SITE_NAME_INTL..." 
from invenio.dbquery import run_sql, IntegrityError # reset CFG_SITE_NAME: sitename = conf.get("Invenio", "CFG_SITE_NAME") try: run_sql("""INSERT INTO collection (id, name, dbquery, reclist) VALUES (1,%s,NULL,NULL)""", (sitename,)) except IntegrityError: run_sql("""UPDATE collection SET name=%s WHERE id=1""", (sitename,)) # reset CFG_SITE_NAME_INTL: for lang in conf.get("Invenio", "CFG_SITE_LANGS").split(","): sitename_lang = conf.get("Invenio", "CFG_SITE_NAME_INTL_" + lang) try: run_sql("""INSERT INTO collectionname (id_collection, ln, type, value) VALUES (%s,%s,%s,%s)""", (1, lang, 'ln', sitename_lang)) except IntegrityError: run_sql("""UPDATE collectionname SET value=%s WHERE ln=%s AND id_collection=1 AND type='ln'""", (sitename_lang, lang)) print "You may want to restart Apache now." print ">>> CFG_SITE_NAME and CFG_SITE_NAME_INTL* reset successfully." def cli_cmd_reset_recstruct_cache(conf): """If CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE is changed, this function will adapt the database to either store or not store the recstruct format.""" from invenio.intbitset import intbitset from invenio.dbquery import run_sql, serialize_via_marshal from invenio.search_engine import get_record from invenio.bibsched import server_pid, pidfile enable_recstruct_cache = conf.get("Invenio", "CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") enable_recstruct_cache = enable_recstruct_cache in ('True', '1') pid = server_pid(ping_the_process=False) if pid: print >> sys.stderr, "ERROR: bibsched seems to run with pid %d, according to %s." % (pid, pidfile) print >> sys.stderr, " Please stop bibsched before running this procedure." sys.exit(1) if enable_recstruct_cache: print ">>> Searching records which need recstruct cache resetting; this may take a while..." 
all_recids = intbitset(run_sql("SELECT id FROM bibrec")) good_recids = intbitset(run_sql("SELECT bibrec.id FROM bibrec JOIN bibfmt ON bibrec.id = bibfmt.id_bibrec WHERE format='recstruct' AND modification_date < last_updated")) recids = all_recids - good_recids print ">>> Generating recstruct cache..." tot = len(recids) count = 0 for recid in recids: value = serialize_via_marshal(get_record(recid)) run_sql("DELETE FROM bibfmt WHERE id_bibrec=%s AND format='recstruct'", (recid, )) run_sql("INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, 'recstruct', NOW(), %s)", (recid, value)) count += 1 if count % 1000 == 0: print " ... done records %s/%s" % (count, tot) if count % 1000 != 0: print " ... done records %s/%s" % (count, tot) print ">>> recstruct cache generated successfully." else: print ">>> Cleaning recstruct cache..." run_sql("DELETE FROM bibfmt WHERE format='recstruct'") def cli_cmd_reset_recjson_cache(conf): """If CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE is changed, this function will adapt the database to either store or not store the recjson format.""" try: import cPickle as pickle except: import pickle from invenio.intbitset import intbitset from invenio.dbquery import run_sql from invenio.bibfield import get_record from invenio.bibsched import server_pid, pidfile enable_recjson_cache = conf.get("Invenio", "CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") enable_recjson_cache = enable_recjson_cache in ('True', '1') pid = server_pid(ping_the_process=False) if pid: print >> sys.stderr, "ERROR: bibsched seems to run with pid %d, according to %s." % (pid, pidfile) print >> sys.stderr, " Please stop bibsched before running this procedure." sys.exit(1) if enable_recjson_cache: print ">>> Searching records which need recjson cache resetting; this may take a while..." all_recids = intbitset(run_sql("SELECT id FROM bibrec")) #TODO: prevent doing all records? recids = all_recids print ">>> Generating recjson cache..." 
tot = len(recids) count = 0 for recid in recids: run_sql("DELETE FROM bibfmt WHERE id_bibrec=%s AND format='recjson'", (recid,)) #TODO: Update the cache or wait for the first access get_record(recid) count += 1 if count % 1000 == 0: print " ... done records %s/%s" % (count, tot) if count % 1000 != 0: print " ... done records %s/%s" % (count, tot) print ">>> recjson cache generated successfully." def cli_cmd_reset_siteadminemail(conf): """ Reset user-related tables with new CFG_SITE_ADMIN_EMAIL read from conf files. """ print ">>> Going to reset CFG_SITE_ADMIN_EMAIL..." from invenio.dbquery import run_sql siteadminemail = conf.get("Invenio", "CFG_SITE_ADMIN_EMAIL") run_sql("DELETE FROM user WHERE id=1") run_sql("""INSERT INTO user (id, email, password, note, nickname) VALUES (1, %s, AES_ENCRYPT(email, ''), 1, 'admin')""", (siteadminemail,)) print "You may want to restart Apache now." print ">>> CFG_SITE_ADMIN_EMAIL reset successfully." def cli_cmd_reset_fieldnames(conf): """ Reset I18N field names such as author, title, etc and other I18N ranking method names such as word similarity. Their translations are taken from the PO files. """ print ">>> Going to reset I18N field names..." 
# NOTE(review): collapsed extraction preserved verbatim (see note at top of
# chunk). Line below: body of cli_cmd_reset_fieldnames(conf). For every
# configured UI language it builds a gettext-translated name table for search
# fields ("any field", "title", "author", ...) and upserts them into
# `fieldname` (INSERT, falling back to UPDATE on IntegrityError — i.e. the
# (id_field, ln, type) row already exists); then does the same for rank-method
# display names into rnkMETHODNAME. Uses Python-2-only dict.has_key().
from invenio.messages import gettext_set_language, language_list_long from invenio.dbquery import run_sql, IntegrityError ## get field id and name list: field_id_name_list = run_sql("SELECT id, name FROM field") ## get rankmethod id and name list: rankmethod_id_name_list = run_sql("SELECT id, name FROM rnkMETHOD") ## update names for every language: for lang, dummy in language_list_long(): _ = gettext_set_language(lang) ## this list is put here in order for PO system to pick names ## suitable for translation field_name_names = {"any field": _("any field"), "title": _("title"), "author": _("author"), "abstract": _("abstract"), "keyword": _("keyword"), "report number": _("report number"), "subject": _("subject"), "reference": _("reference"), "fulltext": _("fulltext"), "collection": _("collection"), "division": _("division"), "year": _("year"), "journal": _("journal"), "experiment": _("experiment"), "record ID": _("record ID")} ## update I18N names for every language: for (field_id, field_name) in field_id_name_list: if field_name_names.has_key(field_name): try: run_sql("""INSERT INTO fieldname (id_field,ln,type,value) VALUES (%s,%s,%s,%s)""", (field_id, lang, 'ln', field_name_names[field_name])) except IntegrityError: run_sql("""UPDATE fieldname SET value=%s WHERE id_field=%s AND ln=%s AND type=%s""", (field_name_names[field_name], field_id, lang, 'ln',)) ## ditto for rank methods: rankmethod_name_names = {"wrd": _("word similarity"), "demo_jif": _("journal impact factor"), "citation": _("times cited"), "citerank_citation_t": _("time-decay cite count"), "citerank_pagerank_c": _("all-time-best cite rank"), "citerank_pagerank_t": _("time-decay cite rank"),} for (rankmethod_id, rankmethod_name) in rankmethod_id_name_list: if rankmethod_name_names.has_key(rankmethod_name): try: run_sql("""INSERT INTO rnkMETHODNAME (id_rnkMETHOD,ln,type,value) VALUES (%s,%s,%s,%s)""", (rankmethod_id, lang, 'ln', rankmethod_name_names[rankmethod_name])) except IntegrityError:
# Line below: the IntegrityError fallback UPDATE for rnkMETHODNAME, closing
# cli_cmd_reset_fieldnames. Then cli_check_openoffice(conf): strips any
# logging handlers from the file-converter logger, verifies the process user,
# and calls can_unoconv(True) to test Libre/OpenOffice integration, exiting
# non-zero on failure. Then test_db_connection() begins: runs "SHOW TABLES"
# and, on a MySQL Error, prints a boxed help message telling the admin how to
# create the database and GRANT privileges (message text continues on the
# next block's lines).
run_sql("""UPDATE rnkMETHODNAME SET value=%s WHERE id_rnkMETHOD=%s AND ln=%s AND type=%s""", (rankmethod_name_names[rankmethod_name], rankmethod_id, lang, 'ln',)) print ">>> I18N field names reset successfully." def cli_check_openoffice(conf): """ If OpenOffice.org integration is enabled, checks whether the system is properly configured. """ from invenio.bibtask import check_running_process_user from invenio.websubmit_file_converter import can_unoconv, get_file_converter_logger logger = get_file_converter_logger() for handler in logger.handlers: logger.removeHandler(handler) check_running_process_user() print ">>> Checking if Libre/OpenOffice.org is correctly integrated...", sys.stdout.flush() if can_unoconv(True): print "ok" else: sys.exit(1) def test_db_connection(): """ Test DB connection, and if fails, advise user how to set it up. Useful to be called during table creation. """ print "Testing DB connection...", from invenio.textutils import wrap_text_in_a_box from invenio.dbquery import run_sql, Error ## first, test connection to the DB server: try: run_sql("SHOW TABLES") except Error, err: from invenio.dbquery import CFG_DATABASE_HOST, CFG_DATABASE_PORT, \ CFG_DATABASE_NAME, CFG_DATABASE_USER, CFG_DATABASE_PASS print wrap_text_in_a_box("""\ DATABASE CONNECTIVITY ERROR %(errno)d: %(errmsg)s.\n Perhaps you need to set up database and connection rights? If yes, then please login as MySQL admin user and run the following commands now: $ mysql -h %(dbhost)s -P %(dbport)s -u root -p mysql mysql> CREATE DATABASE %(dbname)s DEFAULT CHARACTER SET utf8; mysql> GRANT ALL PRIVILEGES ON %(dbname)s.* TO %(dbuser)s@%(webhost)s IDENTIFIED BY '%(dbpass)s'; mysql> QUIT The values printed above were detected from your configuration. If they are not right, then please edit your invenio-local.conf file and rerun 'inveniocfg --update-all' first.
# NOTE(review): collapsed extraction preserved verbatim. Line below: tail of
# test_db_connection() — finishes the boxed connectivity-error message
# (webhost falls back to `hostname -f` output when CFG_DATABASE_HOST is not
# 'localhost'), then runs a second check: round-trips a UTF-8 Greek beta
# through a throwaway MyISAM table and asserts the exact column/byte lengths
# (2L/2L/1L/2L, Python-2 longs) to detect Python/MySQLdb/MySQL charset
# mis-setup; the temp table is dropped in a finally block either way.
If the problem is of different nature, then please inspect the above error message and fix the problem before continuing.""" % \ {'errno': err.args[0], 'errmsg': err.args[1], 'dbname': CFG_DATABASE_NAME, 'dbhost': CFG_DATABASE_HOST, 'dbport': CFG_DATABASE_PORT, 'dbuser': CFG_DATABASE_USER, 'dbpass': CFG_DATABASE_PASS, 'webhost': CFG_DATABASE_HOST == 'localhost' and 'localhost' or os.popen('hostname -f', 'r').read().strip(), }) sys.exit(1) print "ok" ## second, test insert/select of a Unicode string to detect ## possible Python/MySQL/MySQLdb mis-setup: print "Testing Python/MySQL/MySQLdb UTF-8 chain...", try: try: beta_in_utf8 = "β" # Greek beta in UTF-8 is 0xCEB2 run_sql("CREATE TABLE test__invenio__utf8 (x char(1), y varbinary(2)) DEFAULT CHARACTER SET utf8 ENGINE=MyISAM;") run_sql("INSERT INTO test__invenio__utf8 (x, y) VALUES (%s, %s)", (beta_in_utf8, beta_in_utf8)) res = run_sql("SELECT x,y,HEX(x),HEX(y),LENGTH(x),LENGTH(y),CHAR_LENGTH(x),CHAR_LENGTH(y) FROM test__invenio__utf8") assert res[0] == ('\xce\xb2', '\xce\xb2', 'CEB2', 'CEB2', 2L, 2L, 1L, 2L) run_sql("DROP TABLE test__invenio__utf8") except Exception, err: print wrap_text_in_a_box("""\ DATABASE RELATED ERROR %s\n A problem was detected with the UTF-8 treatment in the chain between the Python application, the MySQLdb connector, and the MySQL database. You may perhaps have installed older versions of some prerequisite packages?\n Please check the INSTALL file and please fix this problem before continuing.""" % err) sys.exit(1) finally: run_sql("DROP TABLE IF EXISTS test__invenio__utf8") print "ok" def cli_cmd_create_tables(conf): """Create and fill Invenio DB tables. Useful for the installation process.""" print ">>> Going to create and fill tables..."
# Line below: body of cli_cmd_create_tables — pipes tabcreate.sql/tabfill.sql
# through bin/dbexec via os.system (exits 1 on any non-zero status), then
# chains the sitename/adminemail/fieldnames resets and webaccessadmin.
# Then cli_cmd_load_webstat_conf and cli_cmd_load_bibfield_config (thin
# wrappers), and cli_cmd_drop_tables — asks for interactive confirmation
# (wait_for_user), destroys webstat custom events, and pipes tabdrop.sql
# through dbexec. Ends with the header of cli_cmd_create_demo_site (body on
# next block). NOTE(review): all these shell out with os.system and
# %-interpolated CFG_PREFIX — safe only because CFG_PREFIX is trusted config.
from invenio.config import CFG_PREFIX test_db_connection() for cmd in ["%s/bin/dbexec < %s/lib/sql/invenio/tabcreate.sql" % (CFG_PREFIX, CFG_PREFIX), "%s/bin/dbexec < %s/lib/sql/invenio/tabfill.sql" % (CFG_PREFIX, CFG_PREFIX)]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) cli_cmd_reset_sitename(conf) cli_cmd_reset_siteadminemail(conf) cli_cmd_reset_fieldnames(conf) for cmd in ["%s/bin/webaccessadmin -u admin -c -a" % CFG_PREFIX]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Tables created and filled successfully." def cli_cmd_load_webstat_conf(conf): print ">>> Going to load WebStat config..." from invenio.config import CFG_PREFIX cmd = "%s/bin/webstatadmin --load-config" % CFG_PREFIX if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> WebStat config load successfully." def cli_cmd_load_bibfield_config(conf): print ">>> Going to load BibField config..." from invenio.bibfield_config_engine import BibFieldParser BibFieldParser().write_to_file() print ">>> BibField config load successfully." def cli_cmd_drop_tables(conf): """Drop Invenio DB tables. Useful for the uninstallation process.""" print ">>> Going to drop tables..." from invenio.config import CFG_PREFIX from invenio.textutils import wrap_text_in_a_box, wait_for_user from invenio.webstat import destroy_customevents wait_for_user(wrap_text_in_a_box("""WARNING: You are going to destroy your database tables!""")) msg = destroy_customevents() if msg: print msg cmd = "%s/bin/dbexec < %s/lib/sql/invenio/tabdrop.sql" % (CFG_PREFIX, CFG_PREFIX) if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Tables dropped successfully." def cli_cmd_create_demo_site(conf): """Create demo site. Useful for testing purposes.""" print ">>> Going to create demo site..."
# NOTE(review): collapsed extraction preserved verbatim. Line below: body of
# cli_cmd_create_demo_site — truncates schTASK/session, loads democfgdata.sql,
# re-runs the fieldname reset (for I18N demo rank-method names), then runs a
# fixed sequence of webaccessadmin/webcoll/bibsort commands, exiting on the
# first failure. Then cli_cmd_load_demo_records: truncates schTASK and runs
# the canonical bibupload→bibindex→bibreformat→webcoll→bibrank→bibsort→OAI
# pipeline (numeric args like "bibupload 1" are bibsched task ids run
# synchronously — presumably; verify against bibsched docs). Ends with the
# def line of cli_cmd_remove_demo_records (docstring continues next line).
from invenio.config import CFG_PREFIX from invenio.dbquery import run_sql run_sql("TRUNCATE schTASK") run_sql("TRUNCATE session") run_sql("DELETE FROM user WHERE email=''") for cmd in ["%s/bin/dbexec < %s/lib/sql/invenio/democfgdata.sql" % \ (CFG_PREFIX, CFG_PREFIX),]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) cli_cmd_reset_fieldnames(conf) # needed for I18N demo ranking method names for cmd in ["%s/bin/webaccessadmin -u admin -c -r -D" % CFG_PREFIX, "%s/bin/webcoll -u admin" % CFG_PREFIX, "%s/bin/webcoll 1" % CFG_PREFIX, "%s/bin/bibsort -u admin --load-config" % CFG_PREFIX, "%s/bin/bibsort 2" % CFG_PREFIX, ]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Demo site created successfully." def cli_cmd_load_demo_records(conf): """Load demo records. Useful for testing purposes.""" from invenio.config import CFG_PREFIX from invenio.dbquery import run_sql print ">>> Going to load demo records..." run_sql("TRUNCATE schTASK") for cmd in ["%s/bin/bibupload -u admin -i %s/var/tmp/demobibdata.xml" % (CFG_PREFIX, CFG_PREFIX), "%s/bin/bibupload 1" % CFG_PREFIX, "%s/bin/bibdocfile --textify --with-ocr --recid 97" % CFG_PREFIX, "%s/bin/bibdocfile --textify --all" % CFG_PREFIX, "%s/bin/bibindex -u admin" % CFG_PREFIX, "%s/bin/bibindex 2" % CFG_PREFIX, "%s/bin/bibreformat -u admin -o HB" % CFG_PREFIX, "%s/bin/bibreformat 3" % CFG_PREFIX, "%s/bin/webcoll -u admin" % CFG_PREFIX, "%s/bin/webcoll 4" % CFG_PREFIX, "%s/bin/bibrank -u admin" % CFG_PREFIX, "%s/bin/bibrank 5" % CFG_PREFIX, "%s/bin/bibsort -u admin -R" % CFG_PREFIX, "%s/bin/bibsort 6" % CFG_PREFIX, "%s/bin/oairepositoryupdater -u admin" % CFG_PREFIX, "%s/bin/oairepositoryupdater 7" % CFG_PREFIX, "%s/bin/bibupload 8" % CFG_PREFIX,]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Demo records loaded successfully." def cli_cmd_remove_demo_records(conf): """Remove demo records.
# Line below: cli_cmd_remove_demo_records body — after interactive
# confirmation it shutil.rmtree's CFG_PREFIX/var/data (destructive!),
# truncates schTASK, and replays tabbibclean.sql + webcoll. Then
# cli_cmd_drop_demo_site (drop tables → recreate tables → remove demo
# records) and four thin wrappers delegating to invenio.testutils
# (unit / JS-unit / regression / web test suites), each exiting 1 on a
# failing suite. The run-web-tests docstring continues on the next block.
Useful when you are finished testing.""" print ">>> Going to remove demo records..." from invenio.config import CFG_PREFIX from invenio.dbquery import run_sql from invenio.textutils import wrap_text_in_a_box, wait_for_user wait_for_user(wrap_text_in_a_box("""WARNING: You are going to destroy your records and documents!""")) if os.path.exists(CFG_PREFIX + os.sep + 'var' + os.sep + 'data'): shutil.rmtree(CFG_PREFIX + os.sep + 'var' + os.sep + 'data') run_sql("TRUNCATE schTASK") for cmd in ["%s/bin/dbexec < %s/lib/sql/invenio/tabbibclean.sql" % (CFG_PREFIX, CFG_PREFIX), "%s/bin/webcoll -u admin" % CFG_PREFIX, "%s/bin/webcoll 1" % CFG_PREFIX,]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Demo records removed successfully." def cli_cmd_drop_demo_site(conf): """Drop demo site completely. Useful when you are finished testing.""" print ">>> Going to drop demo site..." from invenio.textutils import wrap_text_in_a_box, wait_for_user wait_for_user(wrap_text_in_a_box("""WARNING: You are going to destroy your site and documents!""")) cli_cmd_drop_tables(conf) cli_cmd_create_tables(conf) cli_cmd_remove_demo_records(conf) print ">>> Demo site dropped successfully." def cli_cmd_run_unit_tests(conf): """Run unit tests, usually on the working demo site.""" from invenio.testutils import build_and_run_unit_test_suite if not build_and_run_unit_test_suite(): sys.exit(1) def cli_cmd_run_js_unit_tests(conf): """Run JavaScript unit tests, usually on the working demo site.""" from invenio.testutils import build_and_run_js_unit_test_suite if not build_and_run_js_unit_test_suite(): sys.exit(1) def cli_cmd_run_regression_tests(conf): """Run regression tests, usually on the working demo site.""" from invenio.testutils import build_and_run_regression_test_suite if not build_and_run_regression_test_suite(): sys.exit(1) def cli_cmd_run_web_tests(conf): """Run web tests in a browser.
# NOTE(review): collapsed extraction preserved verbatim. IMPORTANT: the Apache
# vhost template strings in this region are visibly corrupted — angle-bracket
# directives (<VirtualHost ...>, <Directory ...>, <Files ...>, <IfModule ...>
# and their closers) appear to have been stripped by the extraction (e.g.
# "deny from all deny from all" with no enclosing <Files> blocks, and a
# dangling "if" before "deflate_directive_needed:"). Recover the real text
# from the upstream inveniocfg.py before editing; nothing here should be
# treated as the authoritative template.
#
# Line below: end of cli_cmd_run_web_tests, then _detect_ip_address() — opens
# a UDP socket "connected" to invenio-software.org to learn the outbound
# interface address, returning '*' on any failure (port 0 with SOCK_DGRAM
# sends nothing; presumably intentional — confirm portability). Then
# cli_cmd_create_apache_conf begins: builds the XSendFile directive list
# (enabled or commented-out per CFG_BIBDOCFILE_USE_XSENDFILE) over the
# document/storage/tmp paths Apache must be allowed to serve.
Requires Firefox with Selenium.""" from invenio.testutils import build_and_run_web_test_suite if not build_and_run_web_test_suite(): sys.exit(1) def _detect_ip_address(): """Detect IP address of this computer. Useful for creating Apache vhost conf snippet on RHEL like machines. @return: IP address, or '*' if cannot detect @rtype: string @note: creates socket for real in order to detect real IP address, not the loopback one. """ try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect(('invenio-software.org', 0)) return s.getsockname()[0] except: return '*' def cli_cmd_create_apache_conf(conf): """ Create Apache conf files for this site, keeping previous files in a backup copy. """ print ">>> Going to create Apache conf files..." from invenio.textutils import wrap_text_in_a_box from invenio.access_control_config import CFG_EXTERNAL_AUTH_USING_SSO apache_conf_dir = conf.get("Invenio", 'CFG_ETCDIR') + \ os.sep + 'apache' ## Preparation of XSendFile directive xsendfile_directive_needed = int(conf.get("Invenio", 'CFG_BIBDOCFILE_USE_XSENDFILE')) != 0 if xsendfile_directive_needed: xsendfile_directive = "XSendFile On\n" else: xsendfile_directive = "#XSendFile On\n" for path in (conf.get('Invenio', 'CFG_BIBDOCFILE_FILEDIR'), # BibDocFile conf.get('Invenio', 'CFG_WEBDIR'), conf.get('Invenio', 'CFG_WEBSUBMIT_STORAGEDIR'), # WebSubmit conf.get('Invenio', 'CFG_TMPDIR'), os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'tmp', 'attachfile'), os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'data', 'comments'), os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'data', 'baskets', 'comments'), '/tmp'): # BibExport if xsendfile_directive_needed: xsendfile_directive += ' XSendFilePath %s\n' % path else: xsendfile_directive += ' #XSendFilePath %s\n' % path xsendfile_directive = xsendfile_directive.strip() ## Preparation of deflate directive deflate_directive_needed = int(conf.get("Invenio", 'CFG_WEBSTYLE_HTTP_USE_COMPRESSION')) != 0 if
# Line below: the mod_deflate snippet (standard Apache deflate recipe with the
# MSIE/Netscape BrowserMatch workarounds) and the Shibboleth SSO block gated on
# CFG_EXTERNAL_AUTH_USING_SSO; then distro detection (Gentoo defaults,
# Debian via /etc/debian_version, RHEL/SLC via /etc/redhat-release) choosing
# Listen/SSL-cert/WSGI-socket needs. NOTE(review): this line begins with
# "deflate_directive_needed:" — its "if" is stranded at the end of the
# previous line; extraction artifact.
deflate_directive_needed: deflate_directive = r""" ## Configuration snippet taken from: ## SetOutputFilter DEFLATE # Netscape 4.x has some problems... BrowserMatch ^Mozilla/4 gzip-only-text/html # Netscape 4.06-4.08 have some more problems BrowserMatch ^Mozilla/4\.0[678] no-gzip # MSIE masquerades as Netscape, but it is fine # BrowserMatch \bMSIE !no-gzip !gzip-only-text/html # NOTE: Due to a bug in mod_setenvif up to Apache 2.0.48 # the above regex won't work. You can use the following # workaround to get the desired effect: BrowserMatch \bMSI[E] !no-gzip !gzip-only-text/html # Don't compress images SetEnvIfNoCase Request_URI \ \.(?:gif|jpe?g|png)$ no-gzip dont-vary # Make sure proxies don't deliver the wrong content Header append Vary User-Agent env=!dont-vary """ else: deflate_directive = "" if CFG_EXTERNAL_AUTH_USING_SSO: shibboleth_directive = r""" SSLRequireSSL # The modules only work using HTTPS AuthType shibboleth ShibRequireSession On ShibRequireAll On ShibExportAssertion Off require valid-user """ else: shibboleth_directive = "" ## Apache vhost conf file is distro specific, so analyze needs: # Gentoo (and generic defaults): listen_directive_needed = True ssl_pem_directive_needed = False ssl_pem_path = '/etc/apache2/ssl/apache.pem' ssl_crt_path = '/etc/apache2/ssl/server.crt' ssl_key_path = '/etc/apache2/ssl/server.key' vhost_ip_address_needed = False wsgi_socket_directive_needed = False # Debian: if os.path.exists(os.path.sep + 'etc' + os.path.sep + 'debian_version'): listen_directive_needed = False ssl_pem_directive_needed = True # RHEL/SLC: if os.path.exists(os.path.sep + 'etc' + os.path.sep + 'redhat-release'): listen_directive_needed = False ssl_crt_path = '/etc/pki/tls/certs/localhost.crt' ssl_key_path = '/etc/pki/tls/private/localhost.key' vhost_ip_address_needed = True wsgi_socket_directive_needed = True # maybe we are using non-standard ports?
# Line below: derives vhost host/port pairs from CFG_SITE_URL /
# CFG_SITE_SECURE_URL (non-default ports force a Listen directive), creates
# the etc/apache dir, and starts the big HTTP-vhost template string filled by
# %-interpolation at the end of the literal. Template text here is the
# stripped/corrupted residue — see block header note.
vhost_site_url = conf.get('Invenio', 'CFG_SITE_URL').replace("http://", "") if vhost_site_url.startswith("https://"): ## The installation is configured to require HTTPS for any connection vhost_site_url = vhost_site_url.replace("https://", "") vhost_site_url_port = '80' vhost_site_secure_url = conf.get('Invenio', 'CFG_SITE_SECURE_URL').replace("https://", "") vhost_site_secure_url_port = '443' if ':' in vhost_site_url: vhost_site_url, vhost_site_url_port = vhost_site_url.split(':', 1) if ':' in vhost_site_secure_url: vhost_site_secure_url, vhost_site_secure_url_port = vhost_site_secure_url.split(':', 1) if vhost_site_url_port != '80' or vhost_site_secure_url_port != '443': listen_directive_needed = True ## OK, let's create Apache vhost files: if not os.path.exists(apache_conf_dir): os.mkdir(apache_conf_dir) apache_vhost_file = apache_conf_dir + os.sep + \ 'invenio-apache-vhost.conf' apache_vhost_ssl_file = apache_conf_dir + os.sep + \ 'invenio-apache-vhost-ssl.conf' apache_vhost_body = """\ AddDefaultCharset UTF-8 ServerSignature Off ServerTokens Prod NameVirtualHost %(vhost_ip_address)s:%(vhost_site_url_port)s %(listen_directive)s %(wsgi_socket_directive)s WSGIRestrictStdout Off deny from all deny from all ServerName %(servername)s ServerAlias %(serveralias)s ServerAdmin %(serveradmin)s DocumentRoot %(webdir)s Options FollowSymLinks MultiViews AllowOverride None Order allow,deny Allow from all ErrorLog %(logdir)s/apache.err LogLevel warn LogFormat "%%h %%l %%u %%t \\"%%r\\" %%>s %%b \\"%%{Referer}i\\" \\"%%{User-agent}i\\" %%D" combined_with_timing CustomLog %(logdir)s/apache.log combined_with_timing DirectoryIndex index.en.html index.html Alias /static/ %(webdir)s/static/ Alias /img/ %(webdir)s/img/ Alias /css/ %(webdir)s/css/ Alias /js/ %(webdir)s/js/ Alias /flash/ %(webdir)s/flash/ Alias /export/ %(webdir)s/export/ Alias /MathJax/ %(webdir)s/MathJax/ Alias /jsCalendar/ %(webdir)s/jsCalendar/ Alias /ckeditor/ %(webdir)s/ckeditor/ Alias /mediaelement/
# Line below: remainder of the HTTP vhost template (mod_wsgi daemon process,
# script alias, XSendFile/deflate substitutions) and its interpolation dict
# (%%{...} doubles the percent for the later %-format), followed by the start
# of the HTTPS/SSL vhost template with pem-vs-crt/key directive selection.
%(webdir)s/mediaelement/ AliasMatch /sitemap-(.*) %(webdir)s/sitemap-$1 Alias /robots.txt %(webdir)s/robots.txt Alias /favicon.ico %(webdir)s/favicon.ico WSGIDaemonProcess invenio processes=5 threads=1 display-name=%%{GROUP} inactivity-timeout=3600 maximum-requests=10000 WSGIImportScript %(wsgidir)s/invenio.wsgi process-group=invenio application-group=%%{GLOBAL} WSGIScriptAlias / %(wsgidir)s/invenio.wsgi WSGIPassAuthorization On %(xsendfile_directive)s WSGIProcessGroup invenio WSGIApplicationGroup %%{GLOBAL} Options FollowSymLinks MultiViews AllowOverride None Order allow,deny Allow from all %(deflate_directive)s """ % {'vhost_site_url_port': vhost_site_url_port, 'servername': vhost_site_url, 'serveralias': vhost_site_url.split('.')[0], 'serveradmin': conf.get('Invenio', 'CFG_SITE_ADMIN_EMAIL'), 'webdir': conf.get('Invenio', 'CFG_WEBDIR'), 'logdir': conf.get('Invenio', 'CFG_LOGDIR'), 'libdir' : conf.get('Invenio', 'CFG_PYLIBDIR'), 'wsgidir': os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'www-wsgi'), 'vhost_ip_address': vhost_ip_address_needed and _detect_ip_address() or '*', 'listen_directive': listen_directive_needed and 'Listen ' + vhost_site_url_port or \ '#Listen ' + vhost_site_url_port, 'wsgi_socket_directive': (wsgi_socket_directive_needed and \ 'WSGISocketPrefix ' or '#WSGISocketPrefix ') + \ conf.get('Invenio', 'CFG_PREFIX') + os.sep + 'var' + os.sep + 'run', 'xsendfile_directive' : xsendfile_directive, 'deflate_directive': deflate_directive, } apache_vhost_ssl_body = """\ ServerSignature Off ServerTokens Prod %(listen_directive)s NameVirtualHost %(vhost_ip_address)s:%(vhost_site_secure_url_port)s %(ssl_pem_directive)s %(ssl_crt_directive)s %(ssl_key_directive)s WSGIRestrictStdout Off deny from all deny from all ServerName %(servername)s ServerAlias %(serveralias)s ServerAdmin %(serveradmin)s SSLEngine on DocumentRoot %(webdir)s Options FollowSymLinks MultiViews AllowOverride None Order allow,deny Allow from all ErrorLog %(logdir)s/apache-ssl.err
# Line below: remainder of the SSL vhost template (including the
# /sslredirect/ RedirectMatch helper) and its interpolation dict; pem vs
# crt/key directives are toggled by ssl_pem_directive_needed (Debian uses the
# combined pem, others a crt+key pair).
LogLevel warn LogFormat "%%h %%l %%u %%t \\"%%r\\" %%>s %%b \\"%%{Referer}i\\" \\"%%{User-agent}i\\" %%D" combined_with_timing CustomLog %(logdir)s/apache-ssl.log combined_with_timing DirectoryIndex index.en.html index.html Alias /static/ %(webdir)s/static/ Alias /img/ %(webdir)s/img/ Alias /css/ %(webdir)s/css/ Alias /js/ %(webdir)s/js/ Alias /flash/ %(webdir)s/flash/ Alias /export/ %(webdir)s/export/ Alias /MathJax/ %(webdir)s/MathJax/ Alias /jsCalendar/ %(webdir)s/jsCalendar/ Alias /ckeditor/ %(webdir)s/ckeditor/ Alias /mediaelement/ %(webdir)s/mediaelement/ AliasMatch /sitemap-(.*) %(webdir)s/sitemap-$1 Alias /robots.txt %(webdir)s/robots.txt Alias /favicon.ico %(webdir)s/favicon.ico RedirectMatch /sslredirect/(.*) http://$1 WSGIScriptAlias / %(wsgidir)s/invenio.wsgi WSGIPassAuthorization On %(xsendfile_directive)s WSGIProcessGroup invenio WSGIApplicationGroup %%{GLOBAL} Options FollowSymLinks MultiViews AllowOverride None Order allow,deny Allow from all %(deflate_directive)s %(shibboleth_directive)s """ % {'vhost_site_secure_url_port': vhost_site_secure_url_port, 'servername': vhost_site_secure_url, 'serveralias': vhost_site_secure_url.split('.')[0], 'serveradmin': conf.get('Invenio', 'CFG_SITE_ADMIN_EMAIL'), 'webdir': conf.get('Invenio', 'CFG_WEBDIR'), 'logdir': conf.get('Invenio', 'CFG_LOGDIR'), 'libdir' : conf.get('Invenio', 'CFG_PYLIBDIR'), 'wsgidir' : os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'www-wsgi'), 'vhost_ip_address': vhost_ip_address_needed and _detect_ip_address() or '*', 'listen_directive' : listen_directive_needed and 'Listen ' + vhost_site_secure_url_port or \ '#Listen ' + vhost_site_secure_url_port, 'ssl_pem_directive': ssl_pem_directive_needed and \ 'SSLCertificateFile %s' % ssl_pem_path or \ '#SSLCertificateFile %s' % ssl_pem_path, 'ssl_crt_directive': ssl_pem_directive_needed and \ '#SSLCertificateFile %s' % ssl_crt_path or \ 'SSLCertificateFile %s' % ssl_crt_path, 'ssl_key_directive': ssl_pem_directive_needed and \
# Line below: closes the interpolation dict, writes the HTTP vhost snippet
# (backing up any existing file to *.OLD), writes the SSL snippet only when
# CFG_SITE_SECURE_URL is https://, and prints the boxed "Include ..." advice.
# Then the docstring/head of cli_cmd_get(conf, varname) begins (body on the
# next block).
'#SSLCertificateKeyFile %s' % ssl_key_path or \ 'SSLCertificateKeyFile %s' % ssl_key_path, 'xsendfile_directive' : xsendfile_directive, 'deflate_directive': deflate_directive, 'shibboleth_directive': shibboleth_directive, } # write HTTP vhost snippet: if os.path.exists(apache_vhost_file): shutil.copy(apache_vhost_file, apache_vhost_file + '.OLD') fdesc = open(apache_vhost_file, 'w') fdesc.write(apache_vhost_body) fdesc.close() print print "Created file", apache_vhost_file # write HTTPS vhost snippet: vhost_ssl_created = False if conf.get('Invenio', 'CFG_SITE_SECURE_URL').startswith("https://"): if os.path.exists(apache_vhost_ssl_file): shutil.copy(apache_vhost_ssl_file, apache_vhost_ssl_file + '.OLD') fdesc = open(apache_vhost_ssl_file, 'w') fdesc.write(apache_vhost_ssl_body) fdesc.close() vhost_ssl_created = True print "Created file", apache_vhost_ssl_file print wrap_text_in_a_box("""\ Apache virtual host configuration file(s) for your Invenio site was(were) created. Please check created file(s) and activate virtual host(s). For example, you can put the following include statements in your httpd.conf:\n Include %s %s Please see the INSTALL file for more details. """ % (apache_vhost_file, (vhost_ssl_created and 'Include ' or '#Include ') + apache_vhost_ssl_file)) print ">>> Apache conf files created." def cli_cmd_get(conf, varname): """ Return value of VARNAME read from CONF files. Useful for third-party programs to access values of conf options such as CFG_PREFIX. Return None if VARNAME is not found.
""" try: if not varname: raise Exception("ERROR: Please specify a configuration variable.") varname = varname.lower() # do not pay attention to section names yet: all_options = {} for section in conf.sections(): for option in conf.options(section): all_options[option] = conf.get(section, option) varvalue = all_options.get(varname, None) if varvalue is None: raise Exception() print varvalue except Exception, e: if e.message: print e.message sys.exit(1) def cli_cmd_list(conf): """ Print a list of all conf options and values from CONF. """ sections = conf.sections() sections.sort() for section in sections: options = conf.options(section) options.sort() for option in options: print option.upper(), '=', conf.get(section, option) def _grep_version_from_executable(path_to_exec, version_regexp): """ Try to detect a program version by digging into its binary PATH_TO_EXEC and looking for VERSION_REGEXP. Return program version as a string. Return empty string if not succeeded. """ from invenio.shellutils import run_shell_command exec_version = "" if os.path.exists(path_to_exec): dummy1, cmd2_out, dummy2 = run_shell_command("strings %s | grep %s", (path_to_exec, version_regexp)) if cmd2_out: for cmd2_out_line in cmd2_out.split("\n"): if len(cmd2_out_line) > len(exec_version): # the longest the better exec_version = cmd2_out_line return exec_version def detect_apache_version(): """ Try to detect Apache version by localizing httpd or apache executables and grepping inside binaries. Return list of all found Apache versions and paths. (For a given executable, the returned format is 'apache_version [apache_path]'.) Return empty list if no success. 
""" from invenio.shellutils import run_shell_command out = [] dummy1, cmd_out, dummy2 = run_shell_command("locate bin/httpd bin/apache") for apache in cmd_out.split("\n"): apache_version = _grep_version_from_executable(apache, '^Apache\/') if apache_version: out.append("%s [%s]" % (apache_version, apache)) return out def cli_cmd_detect_system_details(conf): """ Detect and print system details such as Apache/Python/MySQL versions etc. Useful for debugging problems on various OS. """ import MySQLdb print ">>> Going to detect system details..." print "* Hostname: " + socket.gethostname() print "* Invenio version: " + conf.get("Invenio", "CFG_VERSION") print "* Python version: " + sys.version.replace("\n", " ") print "* Apache version: " + ";\n ".join(detect_apache_version()) print "* MySQLdb version: " + MySQLdb.__version__ try: from invenio.dbquery import run_sql print "* MySQL version:" for key, val in run_sql("SHOW VARIABLES LIKE 'version%'") + \ run_sql("SHOW VARIABLES LIKE 'charact%'") + \ run_sql("SHOW VARIABLES LIKE 'collat%'"): if False: print " - %s: %s" % (key, val) elif key in ['version', 'character_set_client', 'character_set_connection', 'character_set_database', 'character_set_results', 'character_set_server', 'character_set_system', 'collation_connection', 'collation_database', 'collation_server']: print " - %s: %s" % (key, val) except ImportError: print "* ERROR: cannot import dbquery" print ">>> System details detected successfully." 
# NOTE(review): collapsed extraction preserved verbatim. Line below: six thin
# wrappers that lazily import invenio.inveniocfg_upgrader and delegate
# (upgrade / upgrade-check / show-pending / show-applied / create-release-
# recipe / create-standard-recipe), then the head of prepare_option_parser()
# with its custom InvenioOption class: back-ports optparse's 'append_const'
# to Python 2.4 and adds 'store_append_const', which both appends the const
# to the 'actions' list AND stores the option value under the const's name —
# so actions run in command-line order while still carrying arguments.
def cli_cmd_upgrade(conf): """ Command for applying upgrades """ from invenio.inveniocfg_upgrader import cmd_upgrade cmd_upgrade(conf) def cli_cmd_upgrade_check(conf): """ Command for running pre-upgrade checks """ from invenio.inveniocfg_upgrader import cmd_upgrade_check cmd_upgrade_check(conf) def cli_cmd_upgrade_show_pending(conf): """ Command for showing upgrades ready to be applied """ from invenio.inveniocfg_upgrader import cmd_upgrade_show_pending cmd_upgrade_show_pending(conf) def cli_cmd_upgrade_show_applied(conf): """ Command for showing all upgrades already applied. """ from invenio.inveniocfg_upgrader import cmd_upgrade_show_applied cmd_upgrade_show_applied(conf) def cli_cmd_upgrade_create_release_recipe(conf, path): """ Create a new release upgrade recipe (for developers). """ from invenio.inveniocfg_upgrader import cmd_upgrade_create_release_recipe cmd_upgrade_create_release_recipe(conf, path) def cli_cmd_upgrade_create_standard_recipe(conf, path, depends_on=None, release=False): """ Create a new upgrade recipe (for developers). """ from invenio.inveniocfg_upgrader import cmd_upgrade_create_standard_recipe cmd_upgrade_create_standard_recipe(conf, path, depends_on=depends_on, release=release) def prepare_option_parser(): """Parse the command line options.""" class InvenioOption(Option): """ Option class that implements the action 'store_append_const' which will 1) append to list in options. 2) take a value and store in options. Useful for e.g. appending a const to an actions list, while also taking an option value and storing it. This ensures that we can run actions in the order they are given on the command-line. Python 2.4 compatibility note: *append_const* action is not available in Python 2.4, so it is implemented here, together with the new action *store_append_const*.
# Line below: InvenioOption internals — extends the ACTIONS/STORE_ACTIONS/
# TYPED_ACTIONS/CONST_ACTIONS tuples, overrides take_action for the two
# custom actions, and replaces _check_const in CHECK_METHODS so 'const' is
# validated against the extended action set. Then the OptionParser is built
# and the "finish your installation" option group starts (--create-apache-
# conf, --create-tables, --load-bibfield-conf...).
""" ACTIONS = Option.ACTIONS + ("store_append_const", "append_const") STORE_ACTIONS = Option.STORE_ACTIONS + ("store_append_const", "append_const") TYPED_ACTIONS = Option.TYPED_ACTIONS + ("store_append_const", ) ALWAYS_TYPED_ACTIONS = Option.ALWAYS_TYPED_ACTIONS + ("store_append_const", ) CONST_ACTIONS = getattr(Option, 'CONST_ACTIONS', ()) + ("store_append_const", "append_const") def take_action(self, action, dest, opt, value, values, parser): if action == "store_append_const": # Combination of 'store' and 'append_const' actions values.ensure_value(dest, []).append(self.const) value_dest = self.const.replace('-', '_') setattr(values, value_dest, value) elif action == "append_const" and not hasattr(Option, 'CONST_ACTIONS'): values.ensure_value(dest, []).append(self.const) else: Option.take_action(self, action, dest, opt, value, values, parser) def _check_const(self): if self.action not in self.CONST_ACTIONS and self.const is not None: raise OptionError( "'const' must not be supplied for action %r" % self.action, self) CHECK_METHODS = [ Option._check_action, Option._check_type, Option._check_choice, Option._check_dest, _check_const, Option._check_nargs, Option._check_callback, ] parser = OptionParser(option_class=InvenioOption, description="Invenio configuration and administration CLI tool", formatter=IndentedHelpFormatter(max_help_position=31)) parser.add_option("-V", "--version", action="store_true", help="print version number") finish_options = OptionGroup(parser, "Options to finish your installation") finish_options.add_option("", "--create-apache-conf", dest='actions', const='create-apache-conf', action="append_const", help="create Apache configuration files") finish_options.add_option("", "--create-tables", dest='actions', const='create-tables', action="append_const", help="create DB tables for Invenio") finish_options.add_option("", "--load-bibfield-conf", dest='actions', const='load-bibfield-conf', action="append_const", help="load bibfield configuration
# Line below: rest of the "finish" group (--load-webstat-conf, --drop-tables,
# --check-openoffice) and the whole demo/test option group (--create-demo-
# site through --run-web-tests). All options funnel into the shared
# dest='actions' list so they execute in the order given.
file") finish_options.add_option("", "--load-webstat-conf", dest='actions', const='load-webstat-conf', action="append_const", help="load the WebStat configuration") finish_options.add_option("", "--drop-tables", dest='actions', const='drop-tables', action="append_const", help="drop DB tables of Invenio") finish_options.add_option("", "--check-openoffice", dest='actions', const='check-openoffice', action="append_const", help="check for correctly set up of openoffice temporary directory") parser.add_option_group(finish_options) demotest_options = OptionGroup(parser, "Options to set up and test a demo site") demotest_options.add_option("", "--create-demo-site", dest='actions', const='create-demo-site', action="append_const", help="create demo site") demotest_options.add_option("", "--load-demo-records", dest='actions', const='load-demo-records', action="append_const", help="load demo records") demotest_options.add_option("", "--remove-demo-records", dest='actions', const='remove-demo-records', action="append_const", help="remove demo records, keeping demo site") demotest_options.add_option("", "--drop-demo-site", dest='actions', const='drop-demo-site', action="append_const", help="drop demo site configurations too") demotest_options.add_option("", "--run-unit-tests", dest='actions', const='run-unit-tests', action="append_const", help="run unit test suite (needs demo site)") demotest_options.add_option("", "--run-js-unit-tests", dest='actions', const='run-js-unit-tests', action="append_const", help="run JS unit test suite (needs demo site)") demotest_options.add_option("", "--run-regression-tests", dest='actions', const='run-regression-tests', action="append_const", help="run regression test suite (needs demo site)") demotest_options.add_option("", "--run-web-tests", dest='actions', const='run-web-tests', action="append_const", help="run web tests in a browser (needs demo site, Firefox, Selenium IDE)") parser.add_option_group(demotest_options) config_options =
# Line below: config-file update group (--update-all, --update-config-py,
# --update-dbquery-py, --update-dbexec, --update-bibconvert-tpl,
# --update-web-tests) and the DB reset group (--reset-all, --reset-sitename,
# --reset-siteadminemail, --reset-fieldnames, start of
# --reset-recstruct-cache).
OptionGroup(parser, "Options to update config files in situ") config_options.add_option("", "--update-all", dest='actions', const='update-all', action="append_const", help="perform all the update options") config_options.add_option("", "--update-config-py", dest='actions', const='update-config-py', action="append_const", help="update config.py file from invenio.conf file") config_options.add_option("", "--update-dbquery-py", dest='actions', const='update-dbquery-py', action="append_const", help="update dbquery.py with DB credentials from invenio.conf") config_options.add_option("", "--update-dbexec", dest='actions', const='update-dbexec', action="append_const", help="update dbexec with DB credentials from invenio.conf") config_options.add_option("", "--update-bibconvert-tpl", dest='actions', const='update-bibconvert-tpl', action="append_const", help="update bibconvert templates with CFG_SITE_URL from invenio.conf") config_options.add_option("", "--update-web-tests", dest='actions', const='update-web-tests', action="append_const", help="update web test cases with CFG_SITE_URL from invenio.conf") parser.add_option_group(config_options) reset_options = OptionGroup(parser, "Options to update DB tables") reset_options.add_option("", "--reset-all", dest='actions', const='reset-all', action="append_const", help="perform all the reset options") reset_options.add_option("", "--reset-sitename", dest='actions', const='reset-sitename', action="append_const", help="reset tables to take account of new CFG_SITE_NAME*") reset_options.add_option("", "--reset-siteadminemail", dest='actions', const='reset-siteadminemail', action="append_const", help="reset tables to take account of new CFG_SITE_ADMIN_EMAIL") reset_options.add_option("", "--reset-fieldnames", dest='actions', const='reset-fieldnames', action="append_const", help="reset tables to take account of new I18N names from PO files") reset_options.add_option("", "--reset-recstruct-cache", dest='actions',
# Line below: cache-reset options, the upgrade option group (the two
# *-create-*-recipe options use the custom 'store_append_const' action so
# they also capture a REPOSITORY[,DIR] value), and the start of the helper
# group (--list, --get).
const='reset-recstruct-cache', action="append_const", help="reset record structure cache according to CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") reset_options.add_option("", "--reset-recjson-cache", dest='actions', const='reset-recjson-cache', action="append_const", help="reset record json structure cache according to CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") parser.add_option_group(reset_options) upgrade_options = OptionGroup(parser, "Options to upgrade your installation") upgrade_options.add_option("", "--upgrade", dest='actions', const='upgrade', action="append_const", help="apply all pending upgrades") upgrade_options.add_option("", "--upgrade-check", dest='actions', const='upgrade-check', action="append_const", help="run pre-upgrade checks for pending upgrades") upgrade_options.add_option("", "--upgrade-show-pending", dest='actions', const='upgrade-show-pending', action="append_const", help="show pending upgrades") upgrade_options.add_option("", "--upgrade-show-applied", dest='actions', const='upgrade-show-applied', action="append_const", help="show history of applied upgrades") upgrade_options.add_option("", "--upgrade-create-standard-recipe", dest='actions', metavar='REPOSITORY[,DIR]', const='upgrade-create-standard-recipe', action="store_append_const", help="create a new standard upgrade recipe (for developers)") upgrade_options.add_option("", "--upgrade-create-release-recipe", dest='actions', metavar='REPOSITORY[,DIR]', const='upgrade-create-release-recipe', action="store_append_const", help="create a new release upgrade recipe (for developers)") parser.add_option_group(upgrade_options) helper_options = OptionGroup(parser, "Options to help the work") helper_options.add_option("", "--list", dest='actions', const='list', action="append_const", help="print names and values of all options from conf files") helper_options.add_option("", "--get", dest='actions', const='get', action="store_append_const", metavar="OPTION", help="get value of a given option from
# Line below: rest of the helper group (--conf-dir, --detect-system-details),
# the --yes-i-know safety flag, and return of the parser. Then
# prepare_conf(options): guesses the etc dir from sys.path[0] when --conf-dir
# is absent and reads invenio.conf, invenio-autotools.conf and the optional
# invenio-local.conf in cascade, raising on a missing required file.
# NOTE(review): in prepare_conf the `sys.exit(1)` directly after
# `raise Exception(...)` is unreachable dead code — upstream defect worth
# fixing once the file is de-collapsed. Then main() begins; it is cut off
# mid-body at the end of this chunk, so only the visible part (version
# printing, conf loading, action dispatch preamble) is preserved.
conf files") helper_options.add_option("", "--conf-dir", action="store", metavar="PATH", help="path to directory where invenio*.conf files are [optional]") helper_options.add_option("", "--detect-system-details", dest='actions', const='detect-system-details', action="append_const", help="print system details such as Apache/Python/MySQL versions") parser.add_option_group(helper_options) parser.add_option('--yes-i-know', action='store_true', dest='yes-i-know', help='use with care!') return parser def prepare_conf(options): """ Read configuration files """ conf = ConfigParser() confdir = getattr(options, 'conf_dir', None) if confdir is None: ## try to detect path to conf dir (relative to this bin dir): confdir = re.sub(r'/bin$', '/etc', sys.path[0]) if confdir and not os.path.exists(confdir): raise Exception("ERROR: bad --conf-dir option value - directory does not exists.") sys.exit(1) ## read conf files: for conffile in [confdir + os.sep + 'invenio.conf', confdir + os.sep + 'invenio-autotools.conf', confdir + os.sep + 'invenio-local.conf', ]: if os.path.exists(conffile): conf.read(conffile) else: if not conffile.endswith("invenio-local.conf"): # invenio-local.conf is optional, otherwise stop raise Exception("ERROR: Badly guessed conf file location %s (Please use --conf-dir option.)" % conffile) return conf def main(*cmd_args): """Main entry point.""" # Allow easier testing if not cmd_args: cmd_args = sys.argv[1:] # Parse arguments parser = prepare_option_parser() (options, dummy_args) = parser.parse_args(list(cmd_args)) if getattr(options, 'version', False): print_version() else: # Read configuration try: conf = prepare_conf(options) except Exception, e: print e sys.exit(1) ## Decide what to do actions = getattr(options, 'actions', None) if not actions: print """ERROR: Please specify a command.
Please see '--help'.""" sys.exit(1) for action in actions: if action == 'get': cli_cmd_get(conf, getattr(options, 'get', None)) elif action == 'list': cli_cmd_list(conf) elif action == 'detect-system-details': cli_cmd_detect_system_details(conf) elif action == 'create-tables': cli_cmd_create_tables(conf) elif action == 'load-webstat-conf': cli_cmd_load_webstat_conf(conf) elif action == 'drop-tables': cli_cmd_drop_tables(conf) elif action == 'check-openoffice': cli_check_openoffice(conf) elif action == 'load-bibfield-conf': cli_cmd_load_bibfield_config(conf) elif action == 'create-demo-site': cli_cmd_create_demo_site(conf) elif action == 'load-demo-records': cli_cmd_load_demo_records(conf) elif action == 'remove-demo-records': cli_cmd_remove_demo_records(conf) elif action == 'drop-demo-site': cli_cmd_drop_demo_site(conf) elif action == 'run-unit-tests': cli_cmd_run_unit_tests(conf) elif action == 'run-js-unit-tests': cli_cmd_run_js_unit_tests(conf) elif action == 'run-regression-tests': cli_cmd_run_regression_tests(conf) elif action == 'run-web-tests': cli_cmd_run_web_tests(conf) elif action == 'update-all': cli_cmd_update_config_py(conf) cli_cmd_update_dbquery_py(conf) cli_cmd_update_dbexec(conf) cli_cmd_update_bibconvert_tpl(conf) cli_cmd_update_web_tests(conf) elif action == 'update-config-py': cli_cmd_update_config_py(conf) elif action == 'update-dbquery-py': cli_cmd_update_dbquery_py(conf) elif action == 'update-dbexec': cli_cmd_update_dbexec(conf) elif action == 'update-bibconvert-tpl': cli_cmd_update_bibconvert_tpl(conf) elif action == 'update-web-tests': cli_cmd_update_web_tests(conf) elif action == 'reset-all': cli_cmd_reset_sitename(conf) cli_cmd_reset_siteadminemail(conf) cli_cmd_reset_fieldnames(conf) cli_cmd_reset_recstruct_cache(conf) elif action == 'reset-sitename': cli_cmd_reset_sitename(conf) elif action == 'reset-siteadminemail': cli_cmd_reset_siteadminemail(conf) elif action == 'reset-fieldnames': cli_cmd_reset_fieldnames(conf) elif action == 
'reset-recstruct-cache': cli_cmd_reset_recstruct_cache(conf) elif action == 'reset-recjson-cache': cli_cmd_reset_recjson_cache(conf) elif action == 'create-apache-conf': cli_cmd_create_apache_conf(conf) elif action == 'upgrade': cli_cmd_upgrade(conf) elif action == 'upgrade-check': cli_cmd_upgrade_check(conf) elif action == 'upgrade-show-pending': cli_cmd_upgrade_show_pending(conf) elif action == 'upgrade-show-applied': cli_cmd_upgrade_show_applied(conf) elif action == 'upgrade-create-standard-recipe': cli_cmd_upgrade_create_standard_recipe(conf, getattr(options, 'upgrade_create_standard_recipe', None)) elif action == 'upgrade-create-release-recipe': cli_cmd_upgrade_create_release_recipe(conf, getattr(options, 'upgrade_create_release_recipe', None)) else: print "ERROR: Unknown command", action sys.exit(1) if __name__ == '__main__': main()