diff --git a/config/invenio.conf b/config/invenio.conf index 59714ef84..269d6079b 100644 --- a/config/invenio.conf +++ b/config/invenio.conf @@ -1,1491 +1,1494 @@ ## This file is part of Invenio. ## Copyright (C) 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ################################################### ## About 'invenio.conf' and 'invenio-local.conf' ## ################################################### ## The 'invenio.conf' file contains the vanilla default configuration ## parameters of an Invenio installation, as coming out of the ## distribution. The file should be self-explanatory. Once installed ## in its usual location (usually /opt/invenio/etc), you could in ## principle go ahead and change the values according to your local ## needs, but this is not advised. ## ## If you would like to customize some of these parameters, you should ## rather create a file named 'invenio-local.conf' in the same ## directory where 'invenio.conf' lives and you should write there ## only the customizations that you want to be different from the ## vanilla defaults. ## ## Here is a realistic, minimalist, yet production-ready example of ## what you would typically put there: ## ## $ cat /opt/invenio/etc/invenio-local.conf ## [Invenio] ## CFG_SITE_NAME = John Doe's Document Server ## CFG_SITE_NAME_INTL_fr = Serveur des Documents de John Doe ## CFG_SITE_URL = http://your.site.com ## CFG_SITE_SECURE_URL = https://your.site.com ## CFG_SITE_ADMIN_EMAIL = john.doe@your.site.com ## CFG_SITE_SUPPORT_EMAIL = john.doe@your.site.com ## CFG_WEBALERT_ALERT_ENGINE_EMAIL = john.doe@your.site.com ## CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL = john.doe@your.site.com ## CFG_WEBCOMMENT_DEFAULT_MODERATOR = john.doe@your.site.com ## CFG_DATABASE_HOST = localhost ## CFG_DATABASE_NAME = invenio ## CFG_DATABASE_USER = invenio ## CFG_DATABASE_PASS = my123p$ss ## ## You should override at least the parameters mentioned above and the ## parameters mentioned in the `Part 1: Essential parameters' below in ## order to define some very essential runtime parameters such as the ## name of your document server (CFG_SITE_NAME and ## CFG_SITE_NAME_INTL_*), the visible URL of your document server ## (CFG_SITE_URL and CFG_SITE_SECURE_URL), the email address of the ## local Invenio administrator, comment moderator, and alert engine ## (CFG_SITE_SUPPORT_EMAIL, CFG_SITE_ADMIN_EMAIL, etc), and last but ## not least your database credentials (CFG_DATABASE_*). ## ## The Invenio system will then read both the default invenio.conf ## file and your customized invenio-local.conf file and it will ## override any default options with the ones you have specified in ## your local file. This cascading of configuration parameters will ## ease your future upgrades.
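##
## As a quick, minimal sketch of the typical workflow (assuming the
## default /opt/invenio prefix used throughout this file; adapt the
## paths and the editor to your installation), you would edit your
## local overrides and then regenerate the derived configuration with
## inveniocfg, whose relevant options are also mentioned later in this
## file, for example:
##
##   $ emacs /opt/invenio/etc/invenio-local.conf
##   $ /opt/invenio/bin/inveniocfg --update-config-py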
[Invenio] ################################### ## Part 1: Essential parameters ## ################################### ## This part defines essential Invenio internal parameters that ## everybody should override, like the name of the server or the email ## address of the local Invenio administrator. ## CFG_DATABASE_* - specify which MySQL server to use, the name of the ## database to use, and the database access credentials. CFG_DATABASE_HOST = localhost CFG_DATABASE_PORT = 3306 CFG_DATABASE_NAME = invenio CFG_DATABASE_USER = invenio CFG_DATABASE_PASS = my123p$ss ## CFG_SITE_URL - specify URL under which your installation will be ## visible. For example, use "http://your.site.com". Do not leave ## trailing slash. CFG_SITE_URL = http://localhost ## CFG_SITE_SECURE_URL - specify secure URL under which your ## installation secure pages such as login or registration will be ## visible. For example, use "https://your.site.com". Do not leave ## trailing slash. If you don't plan on using HTTPS, then you may ## leave this empty. CFG_SITE_SECURE_URL = https://localhost ## CFG_SITE_NAME -- the visible name of your Invenio installation. CFG_SITE_NAME = Atlantis Institute of Fictive Science ## CFG_SITE_NAME_INTL -- the international versions of CFG_SITE_NAME ## in various languages. (See also CFG_SITE_LANGS below.) CFG_SITE_NAME_INTL_en = Atlantis Institute of Fictive Science CFG_SITE_NAME_INTL_fr = Atlantis Institut des Sciences Fictives CFG_SITE_NAME_INTL_de = Atlantis Institut der fiktiven Wissenschaft CFG_SITE_NAME_INTL_es = Atlantis Instituto de la Ciencia Fictive CFG_SITE_NAME_INTL_ca = Institut Atlantis de Ciència Fictícia CFG_SITE_NAME_INTL_pt = Instituto Atlantis de Ciência Fictícia CFG_SITE_NAME_INTL_it = Atlantis Istituto di Scienza Fittizia CFG_SITE_NAME_INTL_ru = Институт Фиктивных Наук Атлантиды CFG_SITE_NAME_INTL_sk = Atlantis Inštitút Fiktívnych Vied CFG_SITE_NAME_INTL_cs = Atlantis Institut Fiktivních Věd CFG_SITE_NAME_INTL_no = Atlantis Institutt for Fiktiv Vitenskap CFG_SITE_NAME_INTL_sv = Atlantis Institut för Fiktiv Vetenskap CFG_SITE_NAME_INTL_el = Ινστιτούτο Φανταστικών Επιστημών Ατλαντίδος CFG_SITE_NAME_INTL_uk = Інститут вигаданих наук в Атлантісі CFG_SITE_NAME_INTL_ja = Fictive 科学のAtlantis の協会 CFG_SITE_NAME_INTL_pl = Instytut Fikcyjnej Nauki Atlantis CFG_SITE_NAME_INTL_bg = Институт за фиктивни науки Атлантис CFG_SITE_NAME_INTL_hr = Institut Fiktivnih Znanosti Atlantis CFG_SITE_NAME_INTL_zh_CN = 阿特兰提斯虚拟科学学院 CFG_SITE_NAME_INTL_zh_TW = 阿特蘭提斯虛擬科學學院 CFG_SITE_NAME_INTL_hu = Kitalált Tudományok Atlantiszi Intézete CFG_SITE_NAME_INTL_af = Atlantis Instituut van Fiktiewe Wetenskap CFG_SITE_NAME_INTL_gl = Instituto Atlantis de Ciencia Fictive CFG_SITE_NAME_INTL_ro = Institutul Atlantis al Ştiinţelor Fictive CFG_SITE_NAME_INTL_rw = Atlantis Ishuri Rikuru Ry'ubuhanga CFG_SITE_NAME_INTL_ka = ატლანტიდის ფიქტიური მეცნიერების ინსტიტუტი CFG_SITE_NAME_INTL_lt = Fiktyvių Mokslų Institutas Atlantis CFG_SITE_NAME_INTL_ar = معهد أطلنطيس للعلوم الافتراضية ## CFG_SITE_LANG -- the default language of the interface: ' CFG_SITE_LANG = en ## CFG_SITE_LANGS -- list of all languages the user interface should ## be available in, separated by commas. The order specified below ## will be respected on the interface pages. A good default would be ## to use the alphabetical order. 
Currently supported languages ## include Afrikaans, Arabic, Bulgarian, Catalan, Czech, German, Georgian, ## Greek, English, Spanish, French, Croatian, Hungarian, Galician, ## Italian, Japanese, Kinyarwanda, Lithuanian, Norwegian, Polish, ## Portuguese, Romanian, Russian, Slovak, Swedish, Ukrainian, Chinese ## (China), Chinese (Taiwan), so that the eventual maximum you can ## currently select is ## "af,ar,bg,ca,cs,de,el,en,es,fr,hr,gl,ka,it,rw,lt,hu,ja,no,pl,pt,ro,ru,sk,sv,uk,zh_CN,zh_TW". CFG_SITE_LANGS = af,ar,bg,ca,cs,de,el,en,es,fr,hr,gl,ka,it,rw,lt,hu,ja,no,pl,pt,ro,ru,sk,sv,uk,zh_CN,zh_TW ## CFG_SITE_SUPPORT_EMAIL -- the email address of the support team for ## this installation: CFG_SITE_SUPPORT_EMAIL = info@invenio-software.org ## CFG_SITE_ADMIN_EMAIL -- the email address of the 'superuser' for ## this installation. Enter your email address below and login with ## this address when using Invenio administration modules. You ## will then be automatically recognized as superuser of the system. CFG_SITE_ADMIN_EMAIL = info@invenio-software.org ## CFG_SITE_EMERGENCY_EMAIL_ADDRESSES -- list of email addresses to ## which an email should be sent in case of emergency (e.g. bibsched ## queue has been stopped because of an error). Configuration ## dictionary allows for different recipients based on weekday and ## time-of-day. Example: ## ## CFG_SITE_EMERGENCY_EMAIL_ADDRESSES = { ## 'Sunday 22:00-06:00': '0041761111111@email2sms.foo.com', ## '06:00-18:00': 'team-in-europe@foo.com,0041762222222@email2sms.foo.com', ## '18:00-06:00': 'team-in-usa@foo.com', ## '*': 'john.doe.phone@foo.com'} ## ## If you want the emergency email notifications to always go to the ## same address, just use the wildcard line in the above example. CFG_SITE_EMERGENCY_EMAIL_ADDRESSES = {} ## CFG_SITE_ADMIN_EMAIL_EXCEPTIONS -- set this to 0 if you do not want ## to receive any captured exception via email to CFG_SITE_ADMIN_EMAIL ## address. Captured exceptions will still be available in ## var/log/invenio.err file. Set this to 1 if you want to receive ## some of the captured exceptions (this depends on the actual place ## where the exception is captured). Set this to 2 if you want to ## receive all captured exceptions. CFG_SITE_ADMIN_EMAIL_EXCEPTIONS = 1 ## CFG_SITE_RECORD -- what is the URI part representing detailed ## record pages? We recommend leaving the default value `record' ## unchanged. CFG_SITE_RECORD = record ## CFG_ERRORLIB_RESET_EXCEPTION_NOTIFICATION_COUNTER_AFTER -- set this to ## the number of seconds after which to reset the exception notification ## counter. A given repetitive exception is notified via email with a ## logarithmic strategy: the first time it is seen it is sent via email, ## then the second time, then the fourth, then the eighth and so forth. ## If the number of seconds elapsed since the last time it was notified ## is greater than CFG_ERRORLIB_RESET_EXCEPTION_NOTIFICATION_COUNTER_AFTER ## then the internal counter is reset in order not to have exception ## notification become more and more rare. CFG_ERRORLIB_RESET_EXCEPTION_NOTIFICATION_COUNTER_AFTER = 14400 ## CFG_CERN_SITE -- do we want to enable CERN-specific code? ## Put "1" for "yes" and "0" for "no". CFG_CERN_SITE = 0 ## CFG_INSPIRE_SITE -- do we want to enable INSPIRE-specific code? ## Put "1" for "yes" and "0" for "no". CFG_INSPIRE_SITE = 0 ## CFG_ADS_SITE -- do we want to enable ADS-specific code? ## Put "1" for "yes" and "0" for "no". CFG_ADS_SITE = 0 ## CFG_OPENAIRE_SITE -- do we want to enable OpenAIRE-specific code?
## Put "1" for "yes" and "0" for "no". CFG_OPENAIRE_SITE = 0 ## CFG_DEVEL_SITE -- is this a development site? If it is, you might ## prefer that it does not do certain things. For example, you might ## not want WebSubmit to send certain emails or trigger certain ## processes on a development site. ## Put "1" for "yes" (this is a development site) or "0" for "no" ## (this isn't a development site.) CFG_DEVEL_SITE = 0 ################################ ## Part 2: Web page style ## ################################ ## The variables affecting the page style. The most important one is ## the 'template skin' you would like to use and the obfuscation mode ## for your email addresses. Please refer to the WebStyle Admin Guide ## for more explanation. The other variables are listed here mostly ## for backwards compatibility purposes only. ## CFG_WEBSTYLE_TEMPLATE_SKIN -- what template skin do you want to ## use? CFG_WEBSTYLE_TEMPLATE_SKIN = default ## CFG_WEBSTYLE_EMAIL_ADDRESSES_OBFUSCATION_MODE. How do we "protect" ## email addresses from undesired automated email harvesters? This ## setting will not affect 'support' and 'admin' emails. ## NOTE: there is no ultimate solution to protect against email ## harvesting. All have drawbacks and can more or less be ## circumvented. Choose you preferred mode ([t] means "transparent" ## for the user): ## -1: hide all emails. ## [t] 0 : no protection, email returned as is. ## foo@example.com => foo@example.com ## 1 : basic email munging: replaces @ by [at] and . by [dot] ## foo@example.com => foo [at] example [dot] com ## [t] 2 : transparent name mangling: characters are replaced by ## equivalent HTML entities. ## foo@example.com => foo@example.com ## [t] 3 : javascript insertion. Requires Javascript enabled on client ## side. ## 4 : replaces @ and . characters by gif equivalents. ## foo@example.com => foo [at] example [dot] com CFG_WEBSTYLE_EMAIL_ADDRESSES_OBFUSCATION_MODE = 2 ## CFG_WEBSTYLE_INSPECT_TEMPLATES -- Do we want to debug all template ## functions so that they would return HTML results wrapped in ## comments indicating which part of HTML page was created by which ## template function? Useful only for debugging Pythonic HTML ## template. See WebStyle Admin Guide for more information. CFG_WEBSTYLE_INSPECT_TEMPLATES = 0 ## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXLEFTTOP -- eventual global HTML ## left top box: CFG_WEBSTYLE_CDSPAGEBOXLEFTTOP = ## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXLEFTBOTTOM -- eventual global ## HTML left bottom box: CFG_WEBSTYLE_CDSPAGEBOXLEFTBOTTOM = ## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXRIGHTTOP -- eventual global ## HTML right top box: CFG_WEBSTYLE_CDSPAGEBOXRIGHTTOP = ## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXRIGHTBOTTOM -- eventual global ## HTML right bottom box: CFG_WEBSTYLE_CDSPAGEBOXRIGHTBOTTOM = ## CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST -- when certain HTTP status ## codes are raised to the WSGI handler, the corresponding exceptions ## and error messages can be sent to the system administrator for ## inspecting. This is useful to detect and correct errors. The ## variable represents a comma-separated list of HTTP statuses that ## should alert admin. Wildcards are possible. If the status is ## followed by an "r", it means that a referer is required to exist ## (useful to distinguish broken known links from URL typos when 404 ## errors are raised). CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST = 404r,400,5*,41* ## CFG_WEBSTYLE_HTTP_USE_COMPRESSION -- whether to enable deflate ## compression of your HTTP/HTTPS connections. 
This will affect the Apache ## configuration snippets created by inveniocfg --create-apache-conf and ## the OAI-PMH Identify response. CFG_WEBSTYLE_HTTP_USE_COMPRESSION = 0 ################################## ## Part 3: WebSearch parameters ## ################################## ## This section contains some configuration parameters for WebSearch ## module. Please note that WebSearch is mostly configured on ## run-time via its WebSearch Admin web interface. The parameters ## below are the ones that you probably do not want to modify very ## often during runtime. (Note that you may modify them ## afterwards too, though.) ## CFG_WEBSEARCH_SEARCH_CACHE_SIZE -- how many queries we want to ## cache in memory per one Apache httpd process? This cache is used ## mainly for "next/previous page" functionality, but it caches also ## "popular" user queries if more than one user happens to search for ## the same thing. Note that large numbers may lead to great memory ## consumption. We recommend a value not greater than 100. CFG_WEBSEARCH_SEARCH_CACHE_SIZE = 0 ## CFG_WEBSEARCH_FIELDS_CONVERT -- if you migrate from an older ## system, you may want to map field codes of your old system (such as ## 'ti') to Invenio/MySQL ("title"). Use Python dictionary syntax ## for the translation table, e.g. {'wau':'author', 'wti':'title'}. ## Usually you don't want to do that, and you would use empty dict {}. CFG_WEBSEARCH_FIELDS_CONVERT = {} ## CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH -- width of the ## search pattern window in the light search interface, in ## characters. CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH = 60 ## CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH -- width of the search ## pattern window in the simple search interface, in characters. CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH = 40 ## CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH -- width of the ## search pattern window in the advanced search interface, in ## characters. CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH = 30 ## CFG_WEBSEARCH_NB_RECORDS_TO_SORT -- how many records do we still ## want to sort? For higher numbers we print only a warning and won't ## perform any sorting other than default 'latest records first', as ## sorting would be very time consuming then. We recommend a value of ## not more than a couple of thousands. CFG_WEBSEARCH_NB_RECORDS_TO_SORT = 1000 ## CFG_WEBSEARCH_CALL_BIBFORMAT -- if a record is being displayed but ## it was not preformatted in the "HTML brief" format, do we want to ## call BibFormatting on the fly? Put "1" for "yes" and "0" for "no". ## Note that "1" will display the record exactly as if it were fully ## preformatted, but it may be slow due to on-the-fly processing; "0" ## will display a default format very fast, but it may not have all ## the fields as in the fully preformatted HTML brief format. Note ## also that this option is active only for old (PHP) formats; the new ## (Python) formats are called on the fly by default anyway, since ## they are much faster. When unsure, please set "0" here. CFG_WEBSEARCH_CALL_BIBFORMAT = 0 ## CFG_WEBSEARCH_USE_ALEPH_SYSNOS -- do we want to make old SYSNOs ## visible rather than MySQL's record IDs? You may use this if you ## migrate from a different e-doc system, and you store your old ## system numbers into 970__a. Put "1" for "yes" and "0" for ## "no". Usually you don't want to do that, though.
CFG_WEBSEARCH_USE_ALEPH_SYSNOS = 0 ## CFG_WEBSEARCH_I18N_LATEST_ADDITIONS -- Put "1" if you want the ## "Latest Additions" in the web collection pages to show ## internationalized records. Useful only if your brief BibFormat ## templates contain internationalized strings. Otherwise put "0" in ## order not to slow down the creation of latest additions by WebColl. CFG_WEBSEARCH_I18N_LATEST_ADDITIONS = 0 ## CFG_WEBSEARCH_INSTANT_BROWSE -- the number of records to display ## under 'Latest Additions' in the web collection pages. CFG_WEBSEARCH_INSTANT_BROWSE = 10 ## CFG_WEBSEARCH_INSTANT_BROWSE_RSS -- the number of records to ## display in the RSS feed. CFG_WEBSEARCH_INSTANT_BROWSE_RSS = 25 ## CFG_WEBSEARCH_RSS_I18N_COLLECTIONS -- comma-separated list of ## collections that feature an internationalized RSS feed on their ## main search interface page created by webcoll. Other collections ## will have RSS feed using CFG_SITE_LANG. CFG_WEBSEARCH_RSS_I18N_COLLECTIONS = ## CFG_WEBSEARCH_RSS_TTL -- number of minutes that indicates how long ## a feed cache is valid. CFG_WEBSEARCH_RSS_TTL = 360 ## CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS -- maximum number of requests kept ## in cache. If the cache is filled, following requests are not cached. CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS = 1000 ## CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD -- up to how many author names ## to print explicitly; for more print "et al". Note that this is ## used in default formatting that is seldom used, as usually ## BibFormat defines all the format. The value below is only used ## when BibFormat fails, for example. CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD = 3 ## CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS -- whether or not to show ## collection grandsons in Narrow Search boxes (sons are shown by ## default, grandsons are configurable here). Use 0 for no and 1 for ## yes. CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS = 1 ## CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX -- shall we ## create help links for Ellis, Nick or Ellis, Nicholas and friends ## when Ellis, N was searched for? Useful if you have one author ## stored in the database under several name formats, namely surname ## comma firstname and surname comma initial cataloging policy. Use 0 ## for no and 1 for yes. CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX = 1 ## CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS -- MathJax is a JavaScript ## library that renders (La)TeX mathematical formulas in the client ## browser. This parameter must contain a comma-separated list of ## output formats for which to apply the MathJax rendering, for example ## "hb,hd". If the list is empty, MathJax is disabled. CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS = ## CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_TIMEOUT -- when searching ## external collections (e.g. SPIRES, CiteSeer, etc), how many seconds ## do we wait for reply before abandoning? CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_TIMEOUT = 5 ## CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_MAXRESULTS -- how many ## results do we fetch? CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_MAXRESULTS = 10 ## CFG_WEBSEARCH_SPLIT_BY_COLLECTION -- do we want to split the search ## results by collection or not? Use 0 for no, 1 for yes. CFG_WEBSEARCH_SPLIT_BY_COLLECTION = 1 ## CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS -- the default number of ## records to display per page in the search results pages.
CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS = 10 ## CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS -- in order to limit denial of ## service attacks the total number of records per group displayed as a ## result of a search query will be limited to this number. Only superuser ## queries will not be affected by this limit. CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS = 200 ## CFG_WEBSEARCH_PERMITTED_RESTRICTED_COLLECTIONS_LEVEL -- logged in users ## might have rights to access some restricted collections. This variable ## tweaks the kind of support the system will automatically provide to the ## user with respect to searching into these restricted collections. ## Set this to 0 in order to have the user explicitly activate restricted ## collections in order to search into them. Set this to 1 in order to ## propose to the user the list of restricted collections to which he/she has ## rights (note: this is not yet implemented). Set this to 2 in order to ## silently add all the restricted collections to which the user has rights ## to any query. ## Note: the system will discover which restricted collections a user has ## rights to, at login time. The time complexity of this procedure is ## proportional to the number of restricted collections. E.g. for a system ## with ~50 restricted collections, you might expect ~1s of delay in the ## login time, when this variable is set to a value higher than 0. CFG_WEBSEARCH_PERMITTED_RESTRICTED_COLLECTIONS_LEVEL = 0 ## CFG_WEBSEARCH_SHOW_COMMENT_COUNT -- do we want to show the 'N comments' ## links on the search engine pages? (useful only when you have allowed ## commenting) CFG_WEBSEARCH_SHOW_COMMENT_COUNT = 1 ## CFG_WEBSEARCH_SHOW_REVIEW_COUNT -- do we want to show the 'N reviews' ## links on the search engine pages? (useful only when you have allowed ## reviewing) CFG_WEBSEARCH_SHOW_REVIEW_COUNT = 1 ## CFG_WEBSEARCH_FULLTEXT_SNIPPETS -- how many full-text snippets to ## display for full-text searches? CFG_WEBSEARCH_FULLTEXT_SNIPPETS = 4 ## CFG_WEBSEARCH_FULLTEXT_SNIPPETS_WORDS -- how many context words ## to display around the pattern in the snippet? CFG_WEBSEARCH_FULLTEXT_SNIPPETS_WORDS = 4 ## CFG_WEBSEARCH_WILDCARD_LIMIT -- some of the queries, wildcard ## queries in particular (ex: cern*, a*), but also regular expressions ## (ex: [a-z]+), may take a long time to respond due to the high ## number of hits. You can limit the number of terms matched by a ## wildcard by setting this variable. A negative value or zero means ## that none of the queries will be limited (which may be wanted but is ## also prone to denial-of-service kind of attacks). CFG_WEBSEARCH_WILDCARD_LIMIT = 50000 ## CFG_WEBSEARCH_SYNONYM_KBRS -- defines which knowledge bases are to ## be used for which index in order to provide runtime synonym lookup ## of user-supplied terms, and what massaging function should be used ## upon search pattern before performing the KB lookup. (Can be one ## of `exact', 'leading_to_comma', `leading_to_number'.) CFG_WEBSEARCH_SYNONYM_KBRS = { 'journal': ['SEARCH-SYNONYM-JOURNAL', 'leading_to_number'], } ## CFG_SOLR_URL -- optionally, you may use Solr to serve full-text ## queries. If so, please specify the URL of your Solr instance. ## (example: http://localhost:8080/solr) CFG_SOLR_URL = ## CFG_WEBSEARCH_PREV_NEXT_HIT_LIMIT -- specify the limit when ## the previous/next/back hit links are to be displayed on detailed record pages.
## In order to speed up list manipulations, if a search returns lots of hits, ## more than this limit, then do not lose time calculating next/previous/back ## hits at all, but display the page directly without them. ## Note also that Invenio installations that do not want ## the next/previous hit link functionality can set this ## variable to zero and these links will not be shown at all. CFG_WEBSEARCH_PREV_NEXT_HIT_LIMIT = 10000 ####################################### ## Part 4: BibHarvest OAI parameters ## ####################################### ## This part defines parameters for the Invenio OAI gateway. ## Useful if you are running Invenio as OAI data provider. ## CFG_OAI_ID_FIELD -- OAI identifier MARC field: CFG_OAI_ID_FIELD = 909COo ## CFG_OAI_SET_FIELD -- OAI set MARC field: CFG_OAI_SET_FIELD = 909COp ## CFG_OAI_DELETED_POLICY -- OAI deletedrecordspolicy ## (no/transient/persistent). CFG_OAI_DELETED_POLICY = no ## CFG_OAI_ID_PREFIX -- OAI identifier prefix: CFG_OAI_ID_PREFIX = atlantis.cern.ch ## CFG_OAI_SAMPLE_IDENTIFIER -- OAI sample identifier: CFG_OAI_SAMPLE_IDENTIFIER = oai:atlantis.cern.ch:CERN-TH-4036 ## CFG_OAI_IDENTIFY_DESCRIPTION -- description for the OAI Identify verb: CFG_OAI_IDENTIFY_DESCRIPTION = oai atlantis.cern.ch : oai:atlantis.cern.ch:CERN-TH-4036 http://atlantis.cern.ch/ Free and unlimited use by anybody with obligation to refer to original record Full content, i.e. preprints may not be harvested by robots Submission restricted. Submitted documents are subject of approval by OAI repository admins. ## CFG_OAI_LOAD -- OAI number of records in a response: CFG_OAI_LOAD = 1000 ## CFG_OAI_EXPIRE -- OAI resumptionToken expiration time: CFG_OAI_EXPIRE = 90000 ## CFG_OAI_SLEEP -- service unavailable between two consecutive ## requests for CFG_OAI_SLEEP seconds: CFG_OAI_SLEEP = 10 ################################## ## Part 5: WebSubmit parameters ## ################################## ## This section contains some configuration parameters for WebSubmit ## module. Please note that WebSubmit is mostly configured on ## run-time via its WebSubmit Admin web interface. The parameters ## below are the ones that you probably do not want to modify during ## runtime. ## CFG_WEBSUBMIT_FILESYSTEM_BIBDOC_GROUP_LIMIT -- the fulltext ## documents are stored under "/opt/invenio/var/data/files/gX/Y" ## directories where X is 0,1,... and Y stands for bibdoc ID. Thus ## documents Y are grouped into directories X and this variable ## indicates the maximum number of documents Y stored in each ## directory X. This limit is imposed solely for filesystem ## performance reasons in order not to have too many subdirectories in ## a given directory. CFG_WEBSUBMIT_FILESYSTEM_BIBDOC_GROUP_LIMIT = 5000 ## CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS -- a comma-separated ## list of document extensions not listed in Python standard mimetype ## library that should be recognized by Invenio. CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS = hpg,link,lis,llb,mat,mpp,msg,docx,docm,xlsx,xlsm,xlsb,pptx,pptm,ppsx,ppsm ## CFG_BIBDOCFILE_USE_XSENDFILE -- if your web server supports ## XSendfile header, you may want to enable this feature in order for ## Invenio to tell the web server to stream files for download (after ## proper authorization checks) by web server's means. This helps to ## liberate Invenio worker processes from being busy with sending big ## files to clients. The web server will take care of that. Note: ## this feature is still somewhat experimental.
Note: when enabled ## (set to 1), then you have to also regenerate Apache vhost conf ## snippets (inveniocfg --update-config-py --create-apache-conf). CFG_BIBDOCFILE_USE_XSENDFILE = 0 ## CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY -- a number between 0 and ## 1 that indicates the probability with which MD5 checksum will be ## verified when streaming bibdocfile-managed files. (0.1 will cause ## the check to be performed once for every 10 downloads) CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY = 0.1 ## CFG_OPENOFFICE_SERVER_HOST -- the host where an OpenOffice Server is ## listening. If localhost, an OpenOffice server will be started ## automatically if it is not already running. ## Note: if you set this to an empty value this will disable the usage of ## OpenOffice for converting documents. ## If you set this to something different from localhost you'll have to take ## care to have an OpenOffice server running on the corresponding host and ## to install the same OpenOffice release both on the client and on the server ## side. ## In order to launch an OpenOffice server on a remote machine, just start ## the usual 'soffice' executable in this way: ## $> soffice -headless -nologo -nodefault -norestore -nofirststartwizard \ ## .. -accept=socket,host=HOST,port=PORT;urp;StarOffice.ComponentContext CFG_OPENOFFICE_SERVER_HOST = localhost ## CFG_OPENOFFICE_SERVER_PORT -- the port where an OpenOffice Server is ## listening. CFG_OPENOFFICE_SERVER_PORT = 2002 ## CFG_OPENOFFICE_USER -- the user that will be used to launch the OpenOffice ## client. It is recommended to set this to a user who doesn't own files, ## e.g. 'nobody'. You should also authorize your Apache server user to be ## able to become this user, e.g. by adding to your /etc/sudoers the following ## line: ## "apache ALL=(nobody) NOPASSWD: ALL" ## provided that apache is the username corresponding to the Apache user. ## On some machines this might be apache2 or www-data. CFG_OPENOFFICE_USER = nobody ################################# ## Part 6: BibIndex parameters ## ################################# ## This section contains some configuration parameters for BibIndex ## module. Please note that BibIndex is mostly configured on run-time ## via its BibIndex Admin web interface. The parameters below are the ## ones that you probably do not want to modify very often during ## runtime. ## CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY -- when fulltext indexing, do ## you want to index locally stored files only, or also external URLs? ## Use "0" to say "no" and "1" to say "yes". CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY = 0 ## CFG_BIBINDEX_REMOVE_STOPWORDS -- when indexing, do we want to remove ## stopwords? Use "0" to say "no" and "1" to say "yes". CFG_BIBINDEX_REMOVE_STOPWORDS = 0 ## CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS -- characters considered as ## alphanumeric separators of word-blocks inside words. You probably ## don't want to change this. CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS = \!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~ ## CFG_BIBINDEX_CHARS_PUNCTUATION -- characters considered as punctuation ## between word-blocks inside words. You probably don't want to ## change this. CFG_BIBINDEX_CHARS_PUNCTUATION = \.\,\:\;\?\!\" ## CFG_BIBINDEX_REMOVE_HTML_MARKUP -- should we attempt to remove HTML markup ## before indexing? Use 1 if you have HTML markup inside metadata ## (e.g. in abstracts), use 0 otherwise.
CFG_BIBINDEX_REMOVE_HTML_MARKUP = 0 ## CFG_BIBINDEX_REMOVE_LATEX_MARKUP -- should we attempt to remove LATEX markup ## before indexing? Use 1 if you have LATEX markup inside metadata ## (e.g. in abstracts), use 0 otherwise. CFG_BIBINDEX_REMOVE_LATEX_MARKUP = 0 ## CFG_BIBINDEX_MIN_WORD_LENGTH -- minimum word length allowed to be added to ## index. The terms smaller than this amount will be discarded. ## Useful to keep the database clean, however you can safely leave ## this value on 0 for up to 1,000,000 documents. CFG_BIBINDEX_MIN_WORD_LENGTH = 0 ## CFG_BIBINDEX_URLOPENER_USERNAME and CFG_BIBINDEX_URLOPENER_PASSWORD -- ## access credentials to access restricted URLs, interesting only if ## you are fulltext-indexing files located on a remote server that is ## only available via username/password. But it's probably better to ## handle this case via IP or some convention; the current scheme is ## mostly there for demo only. CFG_BIBINDEX_URLOPENER_USERNAME = mysuperuser CFG_BIBINDEX_URLOPENER_PASSWORD = mysuperpass ## CFG_INTBITSET_ENABLE_SANITY_CHECKS -- ## Enable sanity checks for integers passed to the intbitset data ## structures. It is good to enable this during debugging ## and to disable it for speed improvements. CFG_INTBITSET_ENABLE_SANITY_CHECKS = False ## CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES -- regular expression that matches ## docnames for which OCR is desired (set this to .* in order to enable ## OCR in general, set this to empty in order to disable it.) CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES = scan-.* ## CFG_BIBINDEX_SPLASH_PAGES -- key-value mapping where the key corresponds ## to a regular expression that matches the URLs of the splash pages of ## a given service and the value is a regular expression of the set of URLs ## referenced via tags in the HTML content of the splash pages that are ## referring to documents that need to be indexed. ## NOTE: for backward compatibility reasons you can set this to a simple ## regular expression that will directly be used as the unique key of the ## map, with corresponding value set to ".*" (in order to match any URL) CFG_BIBINDEX_SPLASH_PAGES = { "http://documents\.cern\.ch/setlink\?.*": ".*", "http://ilcagenda\.linearcollider\.org/subContributionDisplay\.py\?.*|http://ilcagenda\.linearcollider\.org/contributionDisplay\.py\?.*": "http://ilcagenda\.linearcollider\.org/getFile\.py/access\?.*|http://ilcagenda\.linearcollider\.org/materialDisplay\.py\?.*", } ## CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES -- do we want ## the author word index to exclude first names to keep only last ## names? If set to True, then for the author `Bernard, Denis', only ## `Bernard' will be indexed in the word index, not `Denis'. Note ## that if you change this variable, you have to re-index the author ## index via `bibindex -w author -R'. CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES = False ## CFG_BIBINDEX_SYNONYM_KBRS -- defines which knowledge bases are to ## be used for which index in order to provide index-time synonym ## lookup, and what massaging function should be used upon search ## pattern before performing the KB lookup. (Can be one of `exact', ## 'leading_to_comma', `leading_to_number'.) CFG_BIBINDEX_SYNONYM_KBRS = { 'global': ['INDEX-SYNONYM-TITLE', 'exact'], 'title': ['INDEX-SYNONYM-TITLE', 'exact'], } ####################################### ## Part 7: Access control parameters ## ####################################### ## This section contains some configuration parameters for the access ## control system.
Please note that WebAccess is mostly configured on ## run-time via its WebAccess Admin web interface. The parameters ## below are the ones that you probably do not want to modify very ## often during runtime. (If you do want to modify them during ## runtime, for example to deny access temporarily because of backups, ## you can edit access_control_config.py directly, no need to get back ## here and no need to redo the make process.) ## CFG_ACCESS_CONTROL_LEVEL_SITE -- defines how open this site is. ## Use 0 for normal operation of the site, 1 for read-only site (all ## write operations temporarily closed), 2 for site fully closed, ## 3 for also disabling any database connection. ## Useful for site maintenance. CFG_ACCESS_CONTROL_LEVEL_SITE = 0 ## CFG_ACCESS_CONTROL_LEVEL_GUESTS -- guest users access policy. Use ## 0 to allow guest users, 1 not to allow them (all users must login). CFG_ACCESS_CONTROL_LEVEL_GUESTS = 0 ## CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS -- account registration and ## activation policy. When 0, users can register and accounts are ## automatically activated. When 1, users can register but admin must ## activate the accounts. When 2, users cannot register nor update ## their email address, only admin can register accounts. When 3, ## users cannot register nor update email address nor password, only ## admin can register accounts. When 4, the same as 3 applies, plus ## the user cannot change his login method. When 5, then the same as 4 ## applies, plus info about how to get an account is hidden from the ## login page. CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS = 0 ## CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN -- limit account ## registration to certain email addresses? If wanted, give domain ## name below, e.g. "cern.ch". If not wanted, leave it empty. CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN = ## CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS -- send a ## notification email to the administrator when a new account is ## created? Use 0 for no, 1 for yes. CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS = 0 ## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT -- send a ## notification email to the user when a new account is created in order ## to verify the validity of the provided email address? Use ## 0 for no, 1 for yes. CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT = 1 ## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_ACTIVATION -- send a ## notification email to the user when a new account is activated? ## Use 0 for no, 1 for yes. CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_ACTIVATION = 0 ## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_DELETION -- send a ## notification email to the user when a new account is deleted or ## an account request is rejected? Use 0 for no, 1 for yes. CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_DELETION = 0 ## CFG_APACHE_PASSWORD_FILE -- the file where Apache user credentials ## are stored. Must be an absolute pathname. If the value does not ## start with a slash, it is considered to be the filename of a file ## located under prefix/var/tmp directory. This is useful for the ## demo site testing purposes. For the production site, if you plan ## to restrict access to some collections based on the Apache user ## authentication mechanism, you should put here an absolute path to ## your Apache password file. CFG_APACHE_PASSWORD_FILE = demo-site-apache-user-passwords ## CFG_APACHE_GROUP_FILE -- the file where Apache user groups are ## defined. See the documentation of the preceding config variable.
CFG_APACHE_GROUP_FILE = demo-site-apache-user-groups ################################### ## Part 8: WebSession parameters ## ################################### ## This section contains some configuration parameters for tweaking ## session handling. ## CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT -- number of days after which a session ## and the corresponding cookie is considered expired. CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT = 2 ## CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER -- number of days after which a session ## and the corresponding cookie is considered expired, when the user has ## requested to permanently stay logged in. CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER = 365 ## CFG_WEBSESSION_RESET_PASSWORD_EXPIRE_IN_DAYS -- when a user requested ## a password reset, for how many days is the URL valid? CFG_WEBSESSION_RESET_PASSWORD_EXPIRE_IN_DAYS = 3 ## CFG_WEBSESSION_ADDRESS_ACTIVATION_EXPIRE_IN_DAYS -- when an account ## activation email was sent, for how many days is the URL valid? CFG_WEBSESSION_ADDRESS_ACTIVATION_EXPIRE_IN_DAYS = 3 ## CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS -- when ## a user does not confirm his email address and does not complete ## registration, after how many days will it expire? CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS = 10 ## CFG_WEBSESSION_DIFFERENTIATE_BETWEEN_GUESTS -- when set to 1, the session ## system allocates the same uid=0 to all guest users regardless of where they ## come from. 0 allocates a unique uid to each guest. CFG_WEBSESSION_DIFFERENTIATE_BETWEEN_GUESTS = 0 ################################ ## Part 9: BibRank parameters ## ################################ ## This section contains some configuration parameters for the ranking ## system. ## CFG_BIBRANK_SHOW_READING_STATS -- do we want to show reading ## similarity stats? ('People who viewed this page also viewed') CFG_BIBRANK_SHOW_READING_STATS = 1 ## CFG_BIBRANK_SHOW_DOWNLOAD_STATS -- do we want to show the download ## similarity stats? ('People who downloaded this document also ## downloaded') CFG_BIBRANK_SHOW_DOWNLOAD_STATS = 1 ## CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS -- do we want to show download ## history graph? (0=no | 1=classic/gnuplot | 2=flot) CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS = 1 ## CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION -- do we ## want to show a graph representing the distribution of client IPs ## downloading given document? CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION = 0 ## CFG_BIBRANK_SHOW_CITATION_LINKS -- do we want to show the 'Cited ## by' links? (useful only when you have citations in the metadata) CFG_BIBRANK_SHOW_CITATION_LINKS = 1 ## CFG_BIBRANK_SHOW_CITATION_STATS -- do we want to show citation ## stats? ('Cited by M records', 'Co-cited with N records') CFG_BIBRANK_SHOW_CITATION_STATS = 1 ## CFG_BIBRANK_SHOW_CITATION_GRAPHS -- do we want to show citation ## history graph? (0=no | 1=classic/gnuplot | 2=flot) CFG_BIBRANK_SHOW_CITATION_GRAPHS = 1 #################################### ## Part 10: WebComment parameters ## #################################### ## This section contains some configuration parameters for the ## commenting and reviewing facilities. ## CFG_WEBCOMMENT_ALLOW_COMMENTS -- do we want to allow users to write ## public comments on records? CFG_WEBCOMMENT_ALLOW_COMMENTS = 1 ## CFG_WEBCOMMENT_ALLOW_REVIEWS -- do we want to allow users to write ## public reviews of records?
CFG_WEBCOMMENT_ALLOW_REVIEWS = 1 ## CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS -- do we want to allow short ## reviews, that is just the attribution of stars without submitting ## detailed review text? CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS = 0 ## CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN -- if users ## report a comment as abusive, how many reports have to be received ## before the site admin is alerted? CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN = 5 ## CFG_WEBCOMMENT_NB_COMMENTS_IN_DETAILED_VIEW -- how many comments do ## we display in the detailed record page upon welcome? CFG_WEBCOMMENT_NB_COMMENTS_IN_DETAILED_VIEW = 1 ## CFG_WEBCOMMENT_NB_REVIEWS_IN_DETAILED_VIEW -- how many reviews do ## we display in the detailed record page upon welcome? CFG_WEBCOMMENT_NB_REVIEWS_IN_DETAILED_VIEW = 1 ## CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL -- do we notify the site ## admin after every comment? CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL = 1 ## CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS -- how many ## elapsed seconds do we consider enough when checking for possible ## multiple comment submissions by a user? CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS = 20 ## CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_REVIEWS_IN_SECONDS -- how many ## elapsed seconds do we consider enough when checking for possible ## multiple review submissions by a user? CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_REVIEWS_IN_SECONDS = 20 ## CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR -- enable the WYSIWYG ## Javascript-based editor when user edits comments? CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR = False ## CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL -- the email address from which the ## alert emails will appear to be sent: CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL = info@invenio-software.org ## CFG_WEBCOMMENT_DEFAULT_MODERATOR -- if no rules are ## specified to indicate who is the comment moderator of ## a collection, this person will be used as default CFG_WEBCOMMENT_DEFAULT_MODERATOR = info@invenio-software.org ## CFG_WEBCOMMENT_USE_MATHJAX_IN_COMMENTS -- do we want to allow the use ## of the MathJax plugin to render LaTeX input in comments? CFG_WEBCOMMENT_USE_MATHJAX_IN_COMMENTS = 1 ## CFG_WEBCOMMENT_AUTHOR_DELETE_COMMENT_OPTION -- allow the comment author to ## delete their own comment? CFG_WEBCOMMENT_AUTHOR_DELETE_COMMENT_OPTION = 1 # CFG_WEBCOMMENT_EMAIL_REPLIES_TO -- which field of the record defines # email addresses that should be notified of newly submitted comments, # and for which collection. Use collection names as keys, and a list of # tags as values CFG_WEBCOMMENT_EMAIL_REPLIES_TO = { 'Articles': ['506__d', '506__m'], } # CFG_WEBCOMMENT_RESTRICTION_DATAFIELD -- which field of the record # defines the restriction (must be linked to WebAccess # 'viewrestrcomment') to apply to newly submitted comments, and for # which collection. Use collection names as keys, and one tag as value CFG_WEBCOMMENT_RESTRICTION_DATAFIELD = { 'Articles': '5061_a', 'Pictures': '5061_a', 'Theses': '5061_a', } # CFG_WEBCOMMENT_ROUND_DATAFIELD -- which field of the record defines # the current round of comments for which collection. Use collection # name as key, and one tag as value CFG_WEBCOMMENT_ROUND_DATAFIELD = { 'Articles': '562__c', 'Pictures': '562__c', } # CFG_WEBCOMMENT_MAX_ATTACHMENT_SIZE -- max file size per attached # file, in bytes. Choose 0 if you don't want to limit the size CFG_WEBCOMMENT_MAX_ATTACHMENT_SIZE = 5242880 # CFG_WEBCOMMENT_MAX_ATTACHED_FILES -- maximum number of files that can # be attached per comment.
Choose 0 if you don't want to limit the # number of files. File uploads can be restricted with action # "attachcommentfile". CFG_WEBCOMMENT_MAX_ATTACHED_FILES = 5 # CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH -- how many levels of # indentation a discussion can have. This can be used to ensure that # discussions will not go into deep levels of nesting if users don't # understand the difference between "reply to comment" and "add # comment". When the depth is reached, any "reply to comment" is # conceptually converted to a "reply to thread" (i.e. reply to this # parent's comment). Use -1 for no limit, 0 for unthreaded (flat) # discussions. CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH = 1 ################################## ## Part 11: BibSched parameters ## ################################## ## This section contains some configuration parameters for the ## bibliographic task scheduler. ## CFG_BIBSCHED_REFRESHTIME -- how often do we want to refresh ## the bibsched monitor? (in seconds) CFG_BIBSCHED_REFRESHTIME = 5 ## CFG_BIBSCHED_LOG_PAGER -- what pager to use to view bibsched task ## logs? CFG_BIBSCHED_LOG_PAGER = /bin/more ## CFG_BIBSCHED_GC_TASKS_OLDER_THAN -- after how many days to perform the ## garbage collection of the BibSched queue (i.e. removing/moving tasks to the archive). CFG_BIBSCHED_GC_TASKS_OLDER_THAN = 30 ## CFG_BIBSCHED_GC_TASKS_TO_REMOVE -- list of BibTasks that can be safely ## removed from the BibSched queue once they are DONE. CFG_BIBSCHED_GC_TASKS_TO_REMOVE = bibindex,bibreformat,webcoll,bibrank,inveniogc ## CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE -- list of BibTasks that should be safely ## archived out of the BibSched queue once they are DONE. CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE = bibupload,oaiarchive ## CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS -- maximum number of BibTasks ## that can run concurrently. ## NOTE: concurrent tasks are still considered as an experimental ## feature. Please keep this value set to 1 on production environments. CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS = 1 ## CFG_BIBSCHED_PROCESS_USER -- bibsched and bibtask processes must ## usually run under the same identity as the Apache web server ## process in order to share proper file read/write privileges. If ## you want to force some other bibsched/bibtask user, e.g. because ## you are using a local `invenio' user that belongs to your ## `www-data' Apache user group and so shares writing rights with your ## Apache web server process in this way, then please set its username ## identity here. Otherwise we shall check whether your ## bibsched/bibtask processes are run under the same identity as your ## Apache web server process (in which case you can leave the default ## empty value here). CFG_BIBSCHED_PROCESS_USER = ## CFG_BIBSCHED_NODE_TASKS -- specific nodes may be configured to ## run only specific tasks; if you want this, then this variable is a ## dictionary of the form {'hostname1': ['task1', 'task2']}. The ## default is that any node can run any task. CFG_BIBSCHED_NODE_TASKS = {} ################################### ## Part 12: WebBasket parameters ## ################################### ## CFG_WEBBASKET_MAX_NUMBER_OF_DISPLAYED_BASKETS -- a safety limit for ## the maximum number of displayed baskets CFG_WEBBASKET_MAX_NUMBER_OF_DISPLAYED_BASKETS = 20 ## CFG_WEBBASKET_USE_RICH_TEXT_EDITOR -- enable the WYSIWYG ## Javascript-based editor when user edits comments in WebBasket?
CFG_WEBBASKET_USE_RICH_TEXT_EDITOR = False ################################## ## Part 13: WebAlert parameters ## ################################## ## This section contains some configuration parameters for the ## automatic email notification alert system. ## CFG_WEBALERT_ALERT_ENGINE_EMAIL -- the email address from which the ## alert emails will appear to be sent: CFG_WEBALERT_ALERT_ENGINE_EMAIL = info@invenio-software.org ## CFG_WEBALERT_MAX_NUM_OF_RECORDS_IN_ALERT_EMAIL -- how many records ## at most do we send in an outgoing alert email? CFG_WEBALERT_MAX_NUM_OF_RECORDS_IN_ALERT_EMAIL = 20 ## CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL -- number of ## chars per line in an outgoing alert email? CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL = 72 ## CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES -- when sending alert ## emails fails, how many times do we retry? CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES = 3 ## CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES -- when sending ## alert emails fails, what is the sleeptime between tries? (in ## seconds) CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES = 300 #################################### ## Part 14: WebMessage parameters ## #################################### ## CFG_WEBMESSAGE_MAX_SIZE_OF_MESSAGE -- how large web messages do we ## allow? CFG_WEBMESSAGE_MAX_SIZE_OF_MESSAGE = 20000 ## CFG_WEBMESSAGE_MAX_NB_OF_MESSAGES -- how many messages for a ## regular user do we allow in their inbox? CFG_WEBMESSAGE_MAX_NB_OF_MESSAGES = 30 ## CFG_WEBMESSAGE_DAYS_BEFORE_DELETE_ORPHANS -- how many days before ## we delete orphaned messages? CFG_WEBMESSAGE_DAYS_BEFORE_DELETE_ORPHANS = 60 ################################## ## Part 15: MiscUtil parameters ## ################################## ## CFG_MISCUTIL_SQL_USE_SQLALCHEMY -- whether to use SQLAlchemy.pool ## in the DB engine of Invenio. It is okay to enable this flag ## even if you have not installed SQLAlchemy. Note that Invenio will ## lose some performance if this option is enabled. CFG_MISCUTIL_SQL_USE_SQLALCHEMY = False ## CFG_MISCUTIL_SQL_RUN_SQL_MANY_LIMIT -- how many queries can we run ## inside run_sql_many() in one SQL statement? The limit value ## depends on MySQL's max_allowed_packet configuration. CFG_MISCUTIL_SQL_RUN_SQL_MANY_LIMIT = 10000 ## CFG_MISCUTIL_SMTP_HOST -- which server to use as outgoing mail server to ## send outgoing emails generated by the system, for example concerning ## submissions or email notification alerts. CFG_MISCUTIL_SMTP_HOST = localhost ## CFG_MISCUTIL_SMTP_PORT -- which port to use on the outgoing mail server ## defined in the previous step. CFG_MISCUTIL_SMTP_PORT = 25 ## CFG_MISCUTIL_DEFAULT_PROCESS_TIMEOUT -- the default number of seconds after ## which a process launched through shellutils.run_process_with_timeout will ## be killed. This is useful to catch runaway processes. CFG_MISCUTIL_DEFAULT_PROCESS_TIMEOUT = 300 ## CFG_MATHJAX_HOSTING -- if you plan to use MathJax to display TeX ## formulas on HTML web pages, you can specify whether you wish to use ## 'local' hosting or 'cdn' hosting of MathJax libraries. (If set to ## 'local', you have to run 'make install-mathjax-plugin' as described ## in the INSTALL guide.) If set to 'local', users will use your site ## to download MathJax sources. If set to 'cdn', users will use ## centralized MathJax CDN servers instead. Please note that using ## CDN is suitable only for small institutions or for MathJax ## sponsors; see the MathJax website for more details.
(Also, please ## note that if you plan to use MathJax on your site, you have to ## adapt CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS and ## CFG_WEBCOMMENT_USE_MATHJAX_IN_COMMENTS configuration variables ## elsewhere in this file.) CFG_MATHJAX_HOSTING = local ################################# ## Part 16: BibEdit parameters ## ################################# ## CFG_BIBEDIT_TIMEOUT -- when a user edits a record, this record is ## locked to prevent other users from editing it at the same time. ## How many seconds of inactivity before the locked record will again be free ## for other people to edit? CFG_BIBEDIT_TIMEOUT = 3600 ## CFG_BIBEDIT_LOCKLEVEL -- when a user tries to edit a record for which there ## is a pending bibupload task in the queue, this shouldn't be permitted. ## The lock level determines how thoroughly the queue should be investigated ## to determine if this is the case. ## Level 0 - always permits editing, doesn't look at the queue ## (unsafe, use only if you know what you are doing) ## Level 1 - permits editing if there are no queued bibedit tasks for this record ## (safe with respect to bibedit, but not for other bibupload maintenance jobs) ## Level 2 - permits editing if there are no queued bibupload tasks of any sort ## (safe, but may lock more than necessary if many cataloguers around) ## Level 3 - permits editing if no queued bibupload task concerns given record ## (safe, most precise locking, but slow, ## checks for 001/EXTERNAL_SYSNO_TAG/EXTERNAL_OAIID_TAG) ## The recommended level is 3 (default) or 2 (if you use maintenance jobs often). CFG_BIBEDIT_LOCKLEVEL = 3 ## CFG_BIBEDIT_PROTECTED_FIELDS -- a comma-separated list of fields that BibEdit ## will not allow to be added, edited or deleted. Wildcards are not supported, ## but conceptually a wildcard is added at the end of every field specification. ## Examples: ## 500A - protect all MARC fields with tag 500 and first indicator A ## 5 - protect all MARC fields in the 500-series. ## 909C_a - protect subfield a in tag 909 with first indicator C and empty ## second indicator ## Note that 001 is protected by default, but if protection of other ## identifiers or automated fields is a requirement, they should be added to ## this list. CFG_BIBEDIT_PROTECTED_FIELDS = ## CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING -- maximum number of records ## that can be modified instantly using the multi-record editor. Above ## this limit, modifications will only be executed in limited hours. CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING = 2000 ## CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING -- maximum number of records ## that can be sent for modification without having a superadmin role. ## If the number of records is between CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING ## and this number, the modifications will take place only in limited hours. CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING = 20000 ## CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING_TIME -- Allowed time to ## execute modifications on records, when the number exceeds ## CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING. CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING_TIME = 22:00-05:00 ################################### ## Part 17: BibUpload parameters ## ################################### ## CFG_BIBUPLOAD_REFERENCE_TAG -- where do we store references? CFG_BIBUPLOAD_REFERENCE_TAG = 999 ## CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG -- where do we store external ## system numbers? Useful for matching when our records come from an ## external digital library system.
CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG = 970__a ## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG -- where do we store OAI ID tags ## of harvested records? Useful for matching when we harvest stuff ## via OAI that we do not want to reexport via Invenio OAI; so records ## may have only the source OAI ID stored in this tag (kind of like ## external system number too). CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG = 035__a ## CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG -- where do we store OAI SRC ## tags of harvested records? Useful for matching when we harvest stuff ## via OAI that we do not want to reexport via Invenio OAI; so records ## may have only the source OAI SRC stored in this tag (kind of like ## external system number too). Note that the field should be the same as ## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG. CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG = 035__9 ## CFG_BIBUPLOAD_STRONG_TAGS -- a comma-separated list of tags that ## are strong enough to resist the replace mode. Useful for tags that ## might be created from an external non-metadata-like source, ## e.g. the information about the number of copies left. CFG_BIBUPLOAD_STRONG_TAGS = 964 ## CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS -- a comma-separated list ## of tags that contain provenance information that should be checked ## in the bibupload correct mode via matching provenance codes. (Only ## field instances of the same provenance information would be acted ## upon.) Please specify the whole tag info up to subfield codes. CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS = 6531_9 ## CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS -- a comma-separated list of system ## paths from which it is allowed to take fulltexts that will be uploaded via ## FFT (CFG_TMPDIR is included by default). CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS = /tmp,/home ## CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS -- a dictionary containing external ## URLs that can be accessed by Invenio and specific HTTP headers that will be ## used for each URL. ## The keys of the dictionary are regular expressions matching a set of URLs, ## the values are dictionaries of headers as consumed by urllib2.Request. If a ## regular expression matching all URLs is created at the end of the list, it ## means that Invenio will download all URLs. Otherwise Invenio will just ## download authorized URLs. ## CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS = [ ## ('http://myurl.com/.*', {'User-Agent': 'Me'}), ## ('http://yoururl.com/.*', {'User-Agent': 'You', 'Accept': 'text/plain'}), ## ('http://.*', {'User-Agent': 'Invenio'}), ## ] CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS = [ ('http://.*', {'User-Agent': 'Invenio'}), ] ## CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE -- do we want to serialize ## the internal representation of records (Pythonic record structure) into ## the database? This can improve internal processing speed of some ## operations at the price of somewhat bigger disk space usage. ## If you change this value after some records have already been added ## to your installation, you may want to run: ## $ /opt/invenio/bin/inveniocfg --reset-recstruct-cache ## in order to either erase the cache thus freeing database space, ## or to fill the cache for all records that have not been cached yet. CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE = 1 ## CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY -- a comma-separated list ## indicating which fields match the file names of the documents to be ## uploaded. ## The matching will be done in the same order as the list provided.
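## For example (an illustrative sketch only; the file names below are
## made up): with the default policy, an uploaded document named
## "REP-NO-123.pdf" would first be looked up by report number, and a
## file named "1234.pdf" that matches no report number could then fall
## back to being matched by record ID.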
CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY = reportnumber,recid ## CFG_BATCHUPLOADER_DAEMON_DIR -- Directory where the batchuploader daemon ## will look for the subfolders metadata and document by default. ## If path is relative, CFG_PREFIX will be joined as a prefix CFG_BATCHUPLOADER_DAEMON_DIR = var/batchupload ## CFG_BATCHUPLOADER_WEB_ROBOT_AGENT -- Comma-separated list to specify the ## agents permitted when calling batch uploader web interface ## cdsweb.cern.ch/batchuploader/robotupload ## if using a curl, eg: curl xxx -A invenio_webupload CFG_BATCHUPLOADER_WEB_ROBOT_AGENT = invenio_webupload ## CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS -- Access list specifying for each ## IP address, which collections are allowed using batch uploader robot ## interface. CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS = { '10.0.0.1': ['BOOK', 'REPORT'], # Example 1 '10.0.0.2': ['POETRY', 'PREPRINT'], # Example 2 } #################################### ## Part 18: BibCatalog parameters ## #################################### ## CFG_BIBCATALOG_SYSTEM -- set desired catalog system. For example, RT. CFG_BIBCATALOG_SYSTEM = ## RT CONFIGURATION ## CFG_BIBCATALOG_SYSTEM_RT_CLI -- path to the RT CLI client CFG_BIBCATALOG_SYSTEM_RT_CLI = /usr/bin/rt ## CFG_BIBCATALOG_SYSTEM_RT_URL -- Base URL of the remote RT system CFG_BIBCATALOG_SYSTEM_RT_URL = http://localhost/rt3 ## CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER -- Set the username for a default RT account ## on remote system, with limited privileges, in order to only create and modify own tickets. CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER = ## CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD -- Set the password for the default RT account ## on remote system. CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD = #################################### ## Part 19: BibFormat parameters ## #################################### ## CFG_BIBFORMAT_HIDDEN_TAGS -- comma-separated list of MARC tags that ## are not shown to users not having cataloging authorizations. CFG_BIBFORMAT_HIDDEN_TAGS = 595 ## CFG_BIBFORMAT_ADDTHIS_ID -- if you want to use the AddThis service from ## , set this value to the pubid parameter as ## provided by the service (e.g. ra-4ff80aae118f4dad). ## See also the bfe_addthis.py BibFormat element. CFG_BIBFORMAT_ADDTHIS_ID = #################################### ## Part 20: BibMatch parameters ## #################################### ## CFG_BIBMATCH_LOCAL_SLEEPTIME -- Determines the amount of seconds to sleep ## between search queries on LOCAL system. CFG_BIBMATCH_LOCAL_SLEEPTIME = 0.0 ## CFG_BIBMATCH_REMOTE_SLEEPTIME -- Determines the amount of seconds to sleep ## between search queries on REMOTE systems. CFG_BIBMATCH_REMOTE_SLEEPTIME = 2.0 ## CFG_BIBMATCH_FUZZY_WORDLIMITS -- Determines the amount of words to extract ## from a certain fields value during fuzzy matching mode. Add/change field ## and appropriate number to the dictionary to configure this. CFG_BIBMATCH_FUZZY_WORDLIMITS = { '100__a': 2, '245__a': 4 } ## CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT -- Determines the amount of empty results ## to accept during fuzzy matching mode. CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT = 1 ## CFG_BIBMATCH_QUERY_TEMPLATES -- Here you can set the various predefined querystrings ## used to standardize common matching queries. By default the following templates ## are given: ## title - standard title search. Taken from 245__a (default) ## title-author - title and author search (i.e. this is a title AND author a) ## Taken from 245__a and 100__a ## reportnumber - reportnumber search (i.e. reportnumber:REP-NO-123). 
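## As a sketch of how a custom template could be added (the
## 'title-report' key is hypothetical and not among the defaults):
## CFG_BIBMATCH_QUERY_TEMPLATES = {
##    'title' : '[title]',
##    'title-author' : '[title] [author]',
##    'reportnumber' : 'reportnumber:[reportnumber]',
##    'title-report' : '[title] reportnumber:[reportnumber]'
## }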
CFG_BIBMATCH_QUERY_TEMPLATES = { 'title' : '[title]', 'title-author' : '[title] [author]', 'reportnumber' : 'reportnumber:[reportnumber]' } ###################################### ## Part 21: BibAuthorID parameters ## ###################################### # CFG_BIBAUTHORID_MAX_PROCESSES is the max number of processes # that may be spawned by the disambiguation algorithm -CFG_BIBAUTHORID_MAX_PROCESSES = 4 +CFG_BIBAUTHORID_MAX_PROCESSES = 12 # CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS is the max number of threads # to parallelize sql queries during personID tables updates -CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS = 4 +CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS = 12 # CFG_BIBAUTHORID_PERSONID_MIN_P_FROM_BCTKD_RA is the minimum confidence needed # when backtracking automatically disambiguated authors to persons. # Values in [0,1] CFG_BIBAUTHORID_PERSONID_MIN_P_FROM_BCTKD_RA = 0.5 # CFG_BIBAUTHORID_PERSONID_MIN_P_FROM_NEW_RA is the threshold for # the confidence in a paper by the disambiguation algorithm to have it # automatically connected to a personID. Papers below the thresholds are # left disconnected from persons if not already connected in other ways. # values in [0,1] CFG_BIBAUTHORID_PERSONID_MIN_P_FROM_NEW_RA = 0.5 # CFG_BIBAUTHORID_PERSONID_MAX_COMP_LIST_MIN_TRSH minimum threshold for # disambiguated authors and persons: if less compatible than this the update # process will create a new person to associate to the found disambiguated author. CFG_BIBAUTHORID_PERSONID_MAX_COMP_LIST_MIN_TRSH = 0.5 # CFG_BIBAUTHORID_PERSONID_MAX_COMP_LIST_MIN_TRSH_P_N is a fallback mechanism # to force a merge if a certain percentage of papers is compatible no matter # what the confidences on the automatically disambiguated author looks like CFG_BIBAUTHORID_PERSONID_MAX_COMP_LIST_MIN_TRSH_P_N = 0.5 # CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY defines the user info # keys for externally claimed records in an remote-login scenario--e.g. from arXiv.org # e.g. "external_arxivids" for arXiv SSO CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY = # CFG_BIBAUTHORID_ATTACH_VA_TO_MULTIPLE_RAS determines if the authorid # algorithm is allowed to attach a virtual author to multiple # real authors in the last run of the orphan processing. # Comma separated list of values. CFG_BIBAUTHORID_ATTACH_VA_TO_MULTIPLE_RAS = False # CFG_BIBAUTHORID_AID_ENABLED # Globally enable AuthorID Interfaces. # If False: No guest, user or operator will have access to the system. CFG_BIBAUTHORID_ENABLED = True # CFG_BIBAUTHORID_AID_ON_AUTHORPAGES # Enable AuthorID information on the author pages. CFG_BIBAUTHORID_ON_AUTHORPAGES = True # CFG_BIBAUTHORID_AUTHOR_TICKET_ADMIN_EMAIL defines the eMail address # all ticket requests concerning authors will be sent to. CFG_BIBAUTHORID_AUTHOR_TICKET_ADMIN_EMAIL = info@invenio-software.org +#CFG_BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE defines if the optional arXive stub page is skipped +CFG_BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE = True + ###################################### ## Part 22: BibClassify parameters ## ###################################### # CFG_BIBCLASSIFY_WEB_MAXKW -- maximum number of keywords to display # in the Keywords tab web page. CFG_BIBCLASSIFY_WEB_MAXKW = 100 ######################################## ## Part 23: Plotextractor parameters ## ######################################## ## CFG_PLOTEXTRACTOR_SOURCE_BASE_URL -- for acquiring source tarballs for plot ## extraction, where should we look? 
If nothing is set, we'll just go ## to arXiv, but this can be a filesystem location, too CFG_PLOTEXTRACTOR_SOURCE_BASE_URL = http://arxiv.org/ ## CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER -- for acquiring source tarballs for plot ## extraction, subfolder where the tarballs sit CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER = e-print/ ## CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER -- for acquiring source tarballs for plot ## extraction, subfolder where the pdf sit CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER = pdf/ ## CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT -- a float representing the number of seconds ## to wait between each download of pdf and/or tarball from source URL. CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT = 2.0 ## CFG_PLOTEXTRACTOR_CONTEXT_LIMIT -- when extracting context of plots from ## TeX sources, this is the limitation of characters in each direction to extract ## context from. Default 750. CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT = 750 ## CFG_PLOTEXTRACTOR_DISALLOWED_TEX -- when extracting context of plots from TeX ## sources, this is the list of TeX tags that will trigger 'end of context'. CFG_PLOTEXTRACTOR_DISALLOWED_TEX = begin,end,section,includegraphics,caption,acknowledgements ## CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT -- when extracting context of plots from ## TeX sources, this is the limitation of words in each direction. Default 75. CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT = 75 ## CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT -- when extracting context of plots from ## TeX sources, this is the limitation of sentences in each direction. Default 2. CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT = 2 ########################## ## THAT's ALL, FOLKS! ## ########################## diff --git a/configure.ac b/configure.ac index b8218a21f..6aacda389 100644 --- a/configure.ac +++ b/configure.ac @@ -1,880 +1,882 @@ ## This file is part of Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ## This is Invenio main configure.ac file. If you change this ## file, then please run "autoreconf" to regenerate the "configure" ## script. ## Initialize autoconf and automake: AC_INIT([invenio], m4_esyscmd([./git-version-gen .tarball-version]), [info@invenio-software.org]) AM_INIT_AUTOMAKE([tar-ustar]) ## By default we shall install into /opt/invenio. (Do not use ## AC_PREFIX_DEFAULT for this, because it would not work well with ## the localstatedir hack below.) 
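## For instance, to install under another location, one would normally
## pass --prefix when running configure (the path is only an example):
##
##   ./configure --prefix=/usr/local/invenio
##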
test "${prefix}" = NONE && prefix=/opt/invenio ## Remove eventual trailing slashes from the prefix value: test "${prefix%/}" != "" && prefix=${prefix%/} ## Check for install: AC_PROG_INSTALL ## Check for gettext support: AM_GNU_GETTEXT(external) AM_GNU_GETTEXT_VERSION(0.14.4) ## Check for MySQL client: AC_MSG_CHECKING(for mysql) AC_ARG_WITH(mysql, AC_HELP_STRING([--with-mysql], [path to a specific MySQL binary (optional)]), MYSQL=${withval}) if test -n "$MYSQL"; then AC_MSG_RESULT($MYSQL) else AC_PATH_PROG(MYSQL, mysql) if test -z "$MYSQL"; then AC_MSG_ERROR([ MySQL command-line client was not found in your PATH. Please install it first. Available from .]) fi fi ## Check for Python: AC_MSG_CHECKING(for python) AC_ARG_WITH(python, AC_HELP_STRING([--with-python], [path to a specific Python binary (optional)]), PYTHON=${withval}) if test -n "$PYTHON"; then AC_MSG_RESULT($PYTHON) else AC_PATH_PROG(PYTHON, python) if test -z "$PYTHON"; then AC_MSG_ERROR([ Python was not found in your PATH. Please either install it in your PATH or specify --with-python configure option. Python is available from .]) fi fi ## Check for OpenOffice.org Python binary: AC_MSG_CHECKING(for OpenOffice.org Python binary) AC_ARG_WITH(openoffice-python, AC_HELP_STRING([--with-openoffice-python], [path to a specific OpenOffice.org Python binary (optional)]), OPENOFFICE_PYTHON=`which ${withval}`) if test -z "$OPENOFFICE_PYTHON"; then OPENOFFICE_PYTHON=`locate -n 1 -r "o.*office/program/python$"` OPENOFFICE_PYTHON="$PYTHON $OPENOFFICE_PYTHON" if test -n "$OPENOFFICE_PYTHON" && ($OPENOFFICE_PYTHON -c "import uno" 2> /dev/null); then AC_MSG_RESULT($OPENOFFICE_PYTHON) else AC_MSG_WARN([ You have not specified the path ot the OpenOffice.org Python binary. OpenOffice.org and Microsoft Office document conversion and fulltext indexing will not be available. We recommend you to install OpenOffice.org first and to rerun the configure script. OpenOffice.org is available from .]) fi elif ($OPENOFFICE_PYTHON -c "import uno" 2> /dev/null); then AC_MSG_RESULT($OPENOFFICE_PYTHON) else AC_MSG_ERROR([ The specified OpenOffice.org Python binary is not correctly configured. Please specify the correct path to the specific OpenOffice Python binary (OpenOffice.org is available from ).]) fi ## Check for Python version and modules: AC_MSG_CHECKING(for required Python modules) $PYTHON ${srcdir}/configure-tests.py if test $? -ne 0; then AC_MSG_ERROR([Please fix the above Python problem before continuing.]) fi AC_MSG_RESULT(found) ## Check for PHP: AC_PATH_PROG(PHP, php) ## Check for gzip: AC_PATH_PROG(GZIP, gzip) if test -z "$GZIP"; then AC_MSG_WARN([ Gzip was not found in your PATH. It is used in the WebSubmit module to compress the data submitted in an archive. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. Gzip is available from .]) fi ## Check for gunzip: AC_PATH_PROG(GUNZIP, gunzip) if test -z "$GUNZIP"; then AC_MSG_WARN([ Gunzip was not found in your PATH. It is used in the WebSubmit module to correctly deal with submitted compressed files. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. Gunzip is available from .]) fi ## Check for tar: AC_PATH_PROG(TAR, tar) if test -z "$TAR"; then AC_MSG_WARN([ Tar was not found in your PATH. It is used in the WebSubmit module to pack the submitted data into an archive. 
You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. Tar is available from .]) fi ## Check for wget: AC_PATH_PROG(WGET, wget) if test -z "$WGET"; then AC_MSG_WARN([ wget was not found in your PATH. It is used for the fulltext file retrieval. You can continue without it but we recomend you to install it first and to rerun the configure script. wget is available from .]) fi ## Check for md5sum: AC_PATH_PROG(MD5SUM, md5sum) if test -z "$MD5SUM"; then AC_MSG_WARN([ md5sum was not found in your PATH. It is used for the fulltext file checksum verification. You can continue without it but we recomend you to install it first and to rerun the configure script. md5sum is available from .]) fi ## Check for ps2pdf: AC_PATH_PROG(PS2PDF, ps2pdf) if test -z "$PS2PDF"; then AC_MSG_WARN([ ps2pdf was not found in your PATH. It is used in the WebSubmit module to convert submitted PostScripts into PDF. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. ps2pdf is available from .]) fi ## Check for pdflatex: AC_PATH_PROG(PDFLATEX, pdflatex) if test -z "$PDFLATEX"; then AC_MSG_WARN([ pdflatex was not found in your PATH. It is used in the WebSubmit module to stamp PDF files. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script.]) fi ## Check for tiff2pdf: AC_PATH_PROG(TIFF2PDF, tiff2pdf) if test -z "$TIFF2PDF"; then AC_MSG_WARN([ tiff2pdf was not found in your PATH. It is used in the WebSubmit module to convert submitted TIFF file into PDF. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. tiff2pdf is available from .]) fi ## Check for gs: AC_PATH_PROG(GS, gs) if test -z "$GS"; then AC_MSG_WARN([ gs was not found in your PATH. It is used in the WebSubmit module to convert submitted PostScripts into PDF. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. gs is available from .]) fi ## Check for pdftotext: AC_PATH_PROG(PDFTOTEXT, pdftotext) if test -z "$PDFTOTEXT"; then AC_MSG_WARN([ pdftotext was not found in your PATH. It is used for the fulltext indexation of PDF files. You can continue without it but you may miss fulltext searching capability of Invenio. We recomend you to install it first and to rerun the configure script. pdftotext is available from . ]) fi ## Check for pdftotext: AC_PATH_PROG(PDFINFO, pdfinfo) if test -z "$PDFINFO"; then AC_MSG_WARN([ pdfinfo was not found in your PATH. It is used for gathering information on PDF files. You can continue without it but you may miss this feature of Invenio. We recomend you to install it first and to rerun the configure script. pdftotext is available from . ]) fi ## Check for pdftk: AC_PATH_PROG(PDFTK, pdftk) if test -z "$PDFTK"; then AC_MSG_WARN([ pdftk was not found in your PATH. It is used for the fulltext file stamping. You can continue without it but you may miss this feature of Invenio. We recomend you to install it first and to rerun the configure script. pdftk is available from . ]) fi ## Check for pdf2ps: AC_PATH_PROG(PDF2PS, pdf2ps) if test -z "$PDF2PS"; then AC_MSG_WARN([ pdf2ps was not found in your PATH. It is used in the WebSubmit module to convert submitted PDFs into PostScript. 
You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. pdf2ps is available from .]) fi ## Check for pdftops: AC_PATH_PROG(PDFTOPS, pdftops) if test -z "$PDFTOPS"; then AC_MSG_WARN([ pdftops was not found in your PATH. It is used in the WebSubmit module to convert submitted PDFs into PostScript. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. pdftops is available from .]) fi ## Check for pdfopt: AC_PATH_PROG(PDFOPT, pdfopt) if test -z "$PDFOPT"; then AC_MSG_WARN([ pdfopt was not found in your PATH. It is used in the WebSubmit module to linearized submitted PDFs. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. pdfopt is available from .]) fi ## Check for pdfimages: AC_PATH_PROG(PDFTOPPM, pdftoppm) if test -z "$PDFTOPPM"; then AC_MSG_WARN([ pdftoppm was not found in your PATH. It is used in the WebSubmit module to extract images from PDFs for OCR. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. pdftoppm is available from .]) fi ## Check for pdfimages: AC_PATH_PROG(PAMFILE, pdftoppm) if test -z "$PAMFILE"; then AC_MSG_WARN([ pamfile was not found in your PATH. It is used in the WebSubmit module to retrieve the size of images extracted from PDFs for OCR. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. pamfile is available as part of the netpbm utilities from: .]) fi ## Check for ocroscript: AC_PATH_PROG(OCROSCRIPT, ocroscript) if test -z "$OCROSCRIPT"; then AC_MSG_WARN([ If you plan to run OCR on your PDFs, then please install ocroscript now. Otherwise you can safely continue. You have also an option to install ocroscript later and edit invenio-local.conf to let Invenio know the path to ocroscript. ocroscript is available as part of OCROpus from . NOTE: Since OCROpus is being actively developed and its api is continuosly changing, please install relase 0.3.1]) fi ## Check for pstotext: AC_PATH_PROG(PSTOTEXT, pstotext) if test -z "$PSTOTEXT"; then AC_MSG_WARN([ pstotext was not found in your PATH. It is used for the fulltext indexation of PDF and PostScript files. Please install pstotext. Otherwise you can safely continue. You have also an option to install pstotext later and edit invenio-local.conf to let Invenio know the path to pstotext. pstotext is available from . ]) fi ## Check for ps2ascii: AC_PATH_PROG(PSTOASCII, ps2ascii) if test -z "$PSTOASCII"; then AC_MSG_WARN([ ps2ascii was not found in your PATH. It is used for the fulltext indexation of PostScript files. Please install ps2ascii. Otherwise you can safely continue. You have also an option to install ps2ascii later and edit invenio-local.conf to let Invenio know the path to ps2ascii. ps2ascii is available from . ]) fi ## Check for any2djvu: AC_PATH_PROG(ANY2DJVU, any2djvu) if test -z "$ANY2DJVU"; then AC_MSG_WARN([ any2djvu was not found in your PATH. It is used in the WebSubmit module to convert documents to DJVU. Please install any2djvu. Otherwise you can safely continue. You have also an option to install any2djvu later and edit invenio-local.conf to let Invenio know the path to any2djvu. 
any2djvu is available from .]) fi ## Check for DJVUPS: AC_PATH_PROG(DJVUPS, djvups) if test -z "$DJVUPS"; then AC_MSG_WARN([ djvups was not found in your PATH. It is used in the WebSubmit module to convert documents from DJVU. Please install djvups. Otherwise you can safely continue. You have also an option to install djvups later and edit invenio-local.conf to let Invenio know the path to djvups. djvups is available from .]) fi ## Check for DJVUTXT: AC_PATH_PROG(DJVUTXT, djvutxt) if test -z "$DJVUTXT"; then AC_MSG_WARN([ djvutxt was not found in your PATH. It is used in the WebSubmit module to extract text from DJVU documents. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. djvutxt is available from .]) fi ## Check for file: AC_PATH_PROG(FILE, file) if test -z "$FILE"; then AC_MSG_WARN([ File was not found in your PATH. It is used in the WebSubmit module to check the validity of the submitted files. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. File is available from .]) fi ## Check for convert: AC_PATH_PROG(CONVERT, convert) if test -z "$CONVERT"; then AC_MSG_WARN([ Convert was not found in your PATH. It is used in the WebSubmit module to create an icon from a submitted picture. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. Convert is available from .]) fi ## Check for CLISP: AC_MSG_CHECKING(for clisp) AC_ARG_WITH(clisp, AC_HELP_STRING([--with-clisp], [path to a specific CLISP binary (optional)]), CLISP=${withval}) if test -n "$CLISP"; then AC_MSG_RESULT($CLISP) else AC_PATH_PROG(CLISP, clisp) if test -z "$CLISP"; then AC_MSG_WARN([ GNU CLISP was not found in your PATH. It is used by the WebStat module to produce statistics about Invenio usage. (Alternatively, SBCL or CMUCL can be used instead of CLISP.) You can continue without it but you will miss this feature. We recommend you to install it first (if you don't have neither CMUCL nor SBCL) and to rerun the configure script. GNU CLISP is available from .]) fi fi ## Check for CMUCL: AC_MSG_CHECKING(for cmucl) AC_ARG_WITH(cmucl, AC_HELP_STRING([--with-cmucl], [path to a specific CMUCL binary (optional)]), CMUCL=${withval}) if test -n "$CMUCL"; then AC_MSG_RESULT($CMUCL) else AC_PATH_PROG(CMUCL, cmucl) if test -z "$CMUCL"; then AC_MSG_CHECKING(for lisp) # CMUCL can also be installed under `lisp' exec name AC_PATH_PROG(CMUCL, lisp) fi if test -z "$CMUCL"; then AC_MSG_WARN([ CMUCL was not found in your PATH. It is used by the WebStat module to produce statistics about Invenio usage. (Alternatively, CLISP or SBCL can be used instead of CMUCL.) You can continue without it but you will miss this feature. We recommend you to install it first (if you don't have neither CLISP nor SBCL) and to rerun the configure script. CMUCL is available from .]) fi fi ## Check for SBCL: AC_MSG_CHECKING(for sbcl) AC_ARG_WITH(sbcl, AC_HELP_STRING([--with-sbcl], [path to a specific SBCL binary (optional)]), SBCL=${withval}) if test -n "$SBCL"; then AC_MSG_RESULT($SBCL) else AC_PATH_PROG(SBCL, sbcl) if test -z "$SBCL"; then AC_MSG_WARN([ SBCL was not found in your PATH. It is used by the WebStat module to produce statistics about Invenio usage. (Alternatively, CLISP or CMUCL can be used instead of SBCL.) You can continue without it but you will miss this feature. 
We recommend you to install it first (if you don't have neither CLISP nor CMUCL) and to rerun the configure script. SBCL is available from .]) fi fi ## Check for gnuplot: AC_PATH_PROG(GNUPLOT, gnuplot) if test -z "$GNUPLOT"; then AC_MSG_WARN([ Gnuplot was not found in your PATH. It is used by the BibRank module to produce graphs about download and citation history. You can continue without it but you will miss these graphs. We recommend you to install it first and to rerun the configure script. Gnuplot is available from .]) fi ## Check for ffmpeg: AC_PATH_PROG(FFMPEG, ffmpeg) AC_PATH_PROG(FFPROBE, ffprobe) if test -z "$FFMPEG"; then AC_MSG_WARN([ FFmpeg was not found in your PATH. It is used by the BibEncode module to for video encoding. You can continue without but you will not be able to use BibEncode and no video submission workflows are thereby possible. We recommend you to install it first if you would like to support video submissions and to rerun the configure script. FFmpeg is available from .]) fi ## Check for mediainfo: AC_PATH_PROG(MEDIAINFO, mediainfo) if test -z "$MEDIAINFO"; then AC_MSG_WARN([ Mediainfo was not found in your PATH. It is used by the BibEncode module to for video encoding and media metadata handling. You can continue without but you will not be able to use BibEncode and no video submission workflows are thereby possible. We recommend you to install it first if you would like to support video submissions and to rerun the configure script. Mediainfo is available from .]) fi ## Check for ffmpeg ## Substitute variables: AC_SUBST(VERSION) AC_SUBST(OPENOFFICE_PYTHON) AC_SUBST(MYSQL) AC_SUBST(PYTHON) AC_SUBST(GZIP) AC_SUBST(GUNZIP) AC_SUBST(TAR) AC_SUBST(WGET) AC_SUBST(MD5SUM) AC_SUBST(PS2PDF) AC_SUBST(GS) AC_SUBST(PDFTOTEXT) AC_SUBST(PDFTK) AC_SUBST(PDF2PS) AC_SUBST(PDFTOPS) AC_SUBST(PDFOPT) AC_SUBST(PDFTOPPM) AC_SUBST(OCROSCRIPT) AC_SUBST(PSTOTEXT) AC_SUBST(PSTOASCII) AC_SUBST(ANY2DJVU) AC_SUBST(DJVUPS) AC_SUBST(DJVUTXT) AC_SUBST(FILE) AC_SUBST(CONVERT) AC_SUBST(GNUPLOT) AC_SUBST(CLISP) AC_SUBST(CMUCL) AC_SUBST(SBCL) AC_SUBST(CACHEDIR) AC_SUBST(FFMPEG) AC_SUBST(MEDIAINFO) AC_SUBST(FFPROBE) AC_SUBST(localstatedir, `eval echo "${localstatedir}"`) ## Define output files: AC_CONFIG_FILES([config.nice \ Makefile \ po/Makefile.in \ config/Makefile \ config/invenio-autotools.conf \ modules/Makefile \ modules/bibauthorid/Makefile \ modules/bibauthorid/bin/Makefile \ modules/bibauthorid/bin/bibauthorid \ modules/bibauthorid/doc/Makefile \ modules/bibauthorid/doc/admin/Makefile \ modules/bibauthorid/doc/hacking/Makefile \ modules/bibauthorid/lib/Makefile \ modules/bibauthorid/lib/bibauthorid_comparison_functions/Makefile \ + modules/bibauthorid/etc/Makefile \ + modules/bibauthorid/etc/name_authority_files/Makefile \ modules/bibauthorid/web/Makefile \ modules/bibcatalog/Makefile \ modules/bibcatalog/doc/Makefile \ modules/bibcatalog/doc/admin/Makefile \ modules/bibcatalog/doc/hacking/Makefile modules/bibcatalog/lib/Makefile \ modules/bibcheck/Makefile \ modules/bibcheck/doc/Makefile \ modules/bibcheck/doc/admin/Makefile \ modules/bibcheck/doc/hacking/Makefile \ modules/bibcheck/etc/Makefile \ modules/bibcheck/web/Makefile \ modules/bibcheck/web/admin/Makefile \ modules/bibcirculation/Makefile \ modules/bibcirculation/bin/Makefile \ modules/bibcirculation/doc/Makefile \ modules/bibcirculation/doc/admin/Makefile \ modules/bibcirculation/doc/hacking/Makefile modules/bibcirculation/lib/Makefile \ modules/bibcirculation/web/Makefile \ modules/bibcirculation/web/admin/Makefile \ 
modules/bibclassify/Makefile \ modules/bibclassify/bin/Makefile \ modules/bibclassify/bin/bibclassify \ modules/bibclassify/doc/Makefile \ modules/bibclassify/doc/admin/Makefile \ modules/bibclassify/doc/hacking/Makefile \ modules/bibclassify/etc/Makefile \ modules/bibclassify/lib/Makefile \ modules/bibconvert/Makefile \ modules/bibconvert/bin/Makefile \ modules/bibconvert/bin/bibconvert \ modules/bibconvert/doc/Makefile \ modules/bibconvert/doc/admin/Makefile \ modules/bibconvert/doc/hacking/Makefile \ modules/bibconvert/etc/Makefile \ modules/bibconvert/lib/Makefile \ modules/bibedit/Makefile \ modules/bibedit/bin/Makefile \ modules/bibedit/bin/bibedit \ modules/bibedit/bin/refextract \ modules/bibedit/bin/xmlmarc2textmarc \ modules/bibedit/bin/textmarc2xmlmarc \ modules/bibedit/bin/xmlmarclint \ modules/bibedit/doc/Makefile \ modules/bibedit/doc/admin/Makefile \ modules/bibedit/doc/hacking/Makefile \ modules/bibedit/etc/Makefile \ modules/bibedit/lib/Makefile \ modules/bibedit/web/Makefile \ modules/bibencode/Makefile \ modules/bibencode/bin/Makefile \ modules/bibencode/bin/bibencode \ modules/bibencode/lib/Makefile \ modules/bibencode/etc/Makefile \ modules/bibencode/www/Makefile \ modules/bibexport/Makefile \ modules/bibexport/bin/Makefile \ modules/bibexport/bin/bibexport \ modules/bibexport/doc/Makefile \ modules/bibexport/doc/admin/Makefile \ modules/bibexport/doc/hacking/Makefile modules/bibexport/etc/Makefile \ modules/bibexport/lib/Makefile \ modules/bibexport/web/Makefile \ modules/bibexport/web/admin/Makefile \ modules/bibformat/Makefile \ modules/bibformat/bin/Makefile \ modules/bibformat/bin/bibreformat \ modules/bibformat/doc/Makefile \ modules/bibformat/doc/admin/Makefile \ modules/bibformat/doc/hacking/Makefile \ modules/bibformat/etc/Makefile \ modules/bibformat/etc/format_templates/Makefile \ modules/bibformat/etc/output_formats/Makefile \ modules/bibformat/lib/Makefile \ modules/bibformat/lib/elements/Makefile \ modules/bibformat/web/Makefile \ modules/bibformat/web/admin/Makefile \ modules/bibharvest/Makefile \ modules/bibharvest/bin/Makefile \ modules/bibharvest/bin/oairepositoryupdater \ modules/bibharvest/bin/oaiharvest \ modules/bibharvest/doc/Makefile \ modules/bibharvest/doc/admin/Makefile \ modules/bibharvest/doc/hacking/Makefile \ modules/bibharvest/lib/Makefile \ modules/bibharvest/web/Makefile \ modules/bibharvest/web/admin/Makefile \ modules/bibindex/Makefile \ modules/bibindex/bin/Makefile \ modules/bibindex/bin/bibindex \ modules/bibindex/bin/bibstat \ modules/bibindex/doc/Makefile \ modules/bibindex/doc/admin/Makefile \ modules/bibindex/doc/hacking/Makefile \ modules/bibindex/lib/Makefile \ modules/bibindex/web/Makefile \ modules/bibindex/web/admin/Makefile \ modules/bibknowledge/Makefile \ modules/bibknowledge/lib/Makefile \ modules/bibknowledge/doc/Makefile \ modules/bibknowledge/doc/admin/Makefile \ modules/bibknowledge/doc/hacking/Makefile \ modules/bibmatch/Makefile \ modules/bibmatch/bin/Makefile \ modules/bibmatch/bin/bibmatch \ modules/bibmatch/doc/Makefile \ modules/bibmatch/doc/admin/Makefile \ modules/bibmatch/etc/Makefile \ modules/bibmatch/lib/Makefile \ modules/bibmerge/Makefile \ modules/bibmerge/bin/Makefile \ modules/bibmerge/doc/Makefile \ modules/bibmerge/doc/admin/Makefile \ modules/bibmerge/doc/hacking/Makefile \ modules/bibmerge/lib/Makefile \ modules/bibmerge/web/Makefile \ modules/bibmerge/web/admin/Makefile \ modules/bibrank/Makefile \ modules/bibrank/bin/Makefile \ modules/bibrank/bin/bibrank \ modules/bibrank/bin/bibrankgkb \ 
modules/bibrank/doc/Makefile \ modules/bibrank/doc/admin/Makefile \ modules/bibrank/doc/hacking/Makefile \ modules/bibrank/etc/Makefile \ modules/bibrank/etc/bibrankgkb.cfg \ modules/bibrank/etc/demo_jif.cfg \ modules/bibrank/etc/template_single_tag_rank_method.cfg \ modules/bibrank/lib/Makefile \ modules/bibrank/web/Makefile \ modules/bibrank/web/admin/Makefile \ modules/bibsched/Makefile \ modules/bibsched/bin/Makefile \ modules/bibsched/bin/bibsched \ modules/bibsched/bin/bibtaskex \ modules/bibsched/bin/bibtasklet \ modules/bibsched/doc/Makefile \ modules/bibsched/doc/admin/Makefile \ modules/bibsched/doc/hacking/Makefile \ modules/bibsched/lib/Makefile \ modules/bibsched/lib/tasklets/Makefile \ modules/bibupload/Makefile \ modules/bibsword/Makefile \ modules/bibsword/bin/Makefile \ modules/bibsword/bin/bibsword \ modules/bibsword/doc/Makefile \ modules/bibsword/doc/admin/Makefile \ modules/bibsword/doc/hacking/Makefile \ modules/bibsword/lib/Makefile \ modules/bibsword/etc/Makefile \ modules/bibupload/bin/Makefile \ modules/bibupload/bin/bibupload \ modules/bibupload/bin/batchuploader \ modules/bibupload/doc/Makefile \ modules/bibupload/doc/admin/Makefile \ modules/bibupload/doc/hacking/Makefile \ modules/bibupload/lib/Makefile \ modules/elmsubmit/Makefile \ modules/elmsubmit/bin/Makefile \ modules/elmsubmit/bin/elmsubmit \ modules/elmsubmit/doc/Makefile \ modules/elmsubmit/doc/admin/Makefile \ modules/elmsubmit/doc/hacking/Makefile \ modules/elmsubmit/etc/Makefile \ modules/elmsubmit/etc/elmsubmit.cfg \ modules/elmsubmit/lib/Makefile \ modules/miscutil/Makefile \ modules/miscutil/bin/Makefile \ modules/miscutil/bin/dbdump \ modules/miscutil/bin/dbexec \ modules/miscutil/bin/inveniocfg \ modules/miscutil/bin/plotextractor \ modules/miscutil/demo/Makefile \ modules/miscutil/doc/Makefile \ modules/miscutil/doc/hacking/Makefile \ modules/miscutil/etc/Makefile \ modules/miscutil/etc/bash_completion.d/Makefile \ modules/miscutil/etc/bash_completion.d/inveniocfg \ modules/miscutil/etc/ckeditor_scientificchar/Makefile \ modules/miscutil/etc/ckeditor_scientificchar/dialogs/Makefile \ modules/miscutil/etc/ckeditor_scientificchar/lang/Makefile \ modules/miscutil/lib/Makefile \ modules/miscutil/sql/Makefile \ modules/miscutil/web/Makefile \ modules/webaccess/Makefile \ modules/webaccess/bin/Makefile \ modules/webaccess/bin/authaction \ modules/webaccess/bin/webaccessadmin \ modules/webaccess/doc/Makefile \ modules/webaccess/doc/admin/Makefile \ modules/webaccess/doc/hacking/Makefile \ modules/webaccess/lib/Makefile \ modules/webaccess/web/Makefile \ modules/webaccess/web/admin/Makefile \ modules/webalert/Makefile \ modules/webalert/bin/Makefile \ modules/webalert/bin/alertengine \ modules/webalert/doc/Makefile \ modules/webalert/doc/admin/Makefile \ modules/webalert/doc/hacking/Makefile \ modules/webalert/lib/Makefile \ modules/webalert/web/Makefile \ modules/webbasket/Makefile \ modules/webbasket/doc/Makefile \ modules/webbasket/doc/admin/Makefile \ modules/webbasket/doc/hacking/Makefile \ modules/webbasket/lib/Makefile \ modules/webbasket/web/Makefile \ modules/webcomment/Makefile \ modules/webcomment/doc/Makefile \ modules/webcomment/doc/admin/Makefile \ modules/webcomment/doc/hacking/Makefile \ modules/webcomment/lib/Makefile \ modules/webcomment/web/Makefile \ modules/webcomment/web/admin/Makefile \ modules/webhelp/Makefile \ modules/webhelp/web/Makefile \ modules/webhelp/web/admin/Makefile \ modules/webhelp/web/admin/howto/Makefile \ modules/webhelp/web/hacking/Makefile \ 
modules/webjournal/Makefile \ modules/webjournal/etc/Makefile \ modules/webjournal/doc/Makefile \ modules/webjournal/doc/admin/Makefile \ modules/webjournal/doc/hacking/Makefile \ modules/webjournal/lib/Makefile \ modules/webjournal/lib/elements/Makefile \ modules/webjournal/lib/widgets/Makefile \ modules/webjournal/web/Makefile \ modules/webjournal/web/admin/Makefile \ modules/webmessage/Makefile \ modules/webmessage/bin/Makefile \ modules/webmessage/bin/webmessageadmin \ modules/webmessage/doc/Makefile \ modules/webmessage/doc/admin/Makefile \ modules/webmessage/doc/hacking/Makefile \ modules/webmessage/lib/Makefile \ modules/webmessage/web/Makefile \ modules/websearch/Makefile \ modules/websearch/bin/Makefile \ modules/websearch/bin/webcoll \ modules/websearch/doc/Makefile \ modules/websearch/doc/admin/Makefile \ modules/websearch/doc/hacking/Makefile \ modules/websearch/lib/Makefile \ modules/websearch/web/Makefile \ modules/websearch/web/admin/Makefile \ modules/websession/Makefile \ modules/websession/bin/Makefile \ modules/websession/bin/inveniogc \ modules/websession/doc/Makefile \ modules/websession/doc/admin/Makefile \ modules/websession/doc/hacking/Makefile \ modules/websession/lib/Makefile \ modules/websession/web/Makefile \ modules/webstat/Makefile \ modules/webstat/bin/Makefile \ modules/webstat/bin/webstat \ modules/webstat/bin/webstatadmin \ modules/webstat/doc/Makefile \ modules/webstat/doc/admin/Makefile \ modules/webstat/doc/hacking/Makefile \ modules/webstat/etc/Makefile \ modules/webstat/lib/Makefile \ modules/webstyle/Makefile \ modules/webstyle/bin/Makefile \ modules/webstyle/bin/webdoc \ modules/webstyle/css/Makefile \ modules/webstyle/doc/Makefile \ modules/webstyle/doc/admin/Makefile \ modules/webstyle/doc/hacking/Makefile \ modules/webstyle/etc/Makefile \ modules/webstyle/img/Makefile \ modules/webstyle/lib/Makefile \ modules/websubmit/Makefile \ modules/websubmit/bin/Makefile \ modules/websubmit/bin/bibdocfile \ modules/websubmit/bin/inveniounoconv \ modules/websubmit/doc/Makefile \ modules/websubmit/doc/admin/Makefile \ modules/websubmit/doc/hacking/Makefile \ modules/websubmit/etc/Makefile \ modules/websubmit/lib/Makefile \ modules/websubmit/lib/functions/Makefile \ modules/websubmit/web/Makefile \ modules/websubmit/web/admin/Makefile \ ]) ## Finally, write output files: AC_OUTPUT ## Write help: AC_MSG_RESULT([****************************************************************************]) AC_MSG_RESULT([** Your Invenio installation is now ready for building. **]) AC_MSG_RESULT([** You have entered the following parameters: **]) AC_MSG_RESULT([** - Invenio main install directory: ${prefix}]) AC_MSG_RESULT([** - Python executable: $PYTHON]) AC_MSG_RESULT([** - MySQL client executable: $MYSQL]) AC_MSG_RESULT([** - CLISP executable: $CLISP]) AC_MSG_RESULT([** - CMUCL executable: $CMUCL]) AC_MSG_RESULT([** - SBCL executable: $SBCL]) AC_MSG_RESULT([** Here are the steps to continue the building process: **]) AC_MSG_RESULT([** 1) Type 'make' to build your Invenio system. **]) AC_MSG_RESULT([** 2) Type 'make install' to install your Invenio system. **]) AC_MSG_RESULT([** After that you can start customizing your installation as documented **]) AC_MSG_RESULT([** in the INSTALL file (i.e. edit invenio.conf, run inveniocfg, etc). **]) AC_MSG_RESULT([** Good luck, and thanks for choosing Invenio. 
**]) AC_MSG_RESULT([** -- Invenio Development Team **]) AC_MSG_RESULT([****************************************************************************]) ## end of file diff --git a/modules/bibauthorid/Makefile.am b/modules/bibauthorid/Makefile.am index cdaf33b34..bbb3f6dc0 100644 --- a/modules/bibauthorid/Makefile.am +++ b/modules/bibauthorid/Makefile.am @@ -1,21 +1,21 @@ ## ## This file is part of Invenio. ## Copyright (C) 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -SUBDIRS = bin doc lib web +SUBDIRS = bin doc lib web etc CLEANFILES = *~ diff --git a/modules/bibauthorid/Makefile.am b/modules/bibauthorid/etc/Makefile.am similarity index 94% copy from modules/bibauthorid/Makefile.am copy to modules/bibauthorid/etc/Makefile.am index cdaf33b34..f521ff166 100644 --- a/modules/bibauthorid/Makefile.am +++ b/modules/bibauthorid/etc/Makefile.am @@ -1,21 +1,18 @@ -## ## This file is part of Invenio. ## Copyright (C) 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -SUBDIRS = bin doc lib web - -CLEANFILES = *~ +SUBDIRS = name_authority_files diff --git a/modules/bibauthorid/Makefile.am b/modules/bibauthorid/etc/name_authority_files/Makefile.am similarity index 77% copy from modules/bibauthorid/Makefile.am copy to modules/bibauthorid/etc/name_authority_files/Makefile.am index cdaf33b34..6b936ff18 100644 --- a/modules/bibauthorid/Makefile.am +++ b/modules/bibauthorid/etc/name_authority_files/Makefile.am @@ -1,21 +1,24 @@ -## ## This file is part of Invenio. -## Copyright (C) 2011 CERN. +## Copyright (C) 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. 
## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -SUBDIRS = bin doc lib web +etcdir = $(sysconfdir)/bibauthorid/name_authority_files + +etc_DATA = female_firstnames.txt male_firstnames.txt name_variants.txt + +EXTRA_DIST = $(etc_DATA) -CLEANFILES = *~ +CLEANFILES = *~ *.tmp diff --git a/modules/bibauthorid/lib/bibauthorid_authorname_utils.py b/modules/bibauthorid/lib/bibauthorid_authorname_utils.py index a8ce10f3e..10a302650 100644 --- a/modules/bibauthorid/lib/bibauthorid_authorname_utils.py +++ b/modules/bibauthorid/lib/bibauthorid_authorname_utils.py @@ -1,1029 +1,1354 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ''' bibauthorid_authornames_utils Helper for accessing the author names data structure ''' import bibauthorid_utils from copy import deepcopy from bibauthorid_utils import clean_name_string from bibauthorid_utils import split_name_parts import bibauthorid_structs as dat import bibauthorid_config as bconfig +from invenio.config import CFG_ETCDIR +import re + +try: + from editdist import distance +except ImportError: + try: + from Levenshtein import distance + except ImportError: + bconfig.LOGGER.exception("Levenshtein Module not available!") + if bconfig.AUTHORNAMES_UTILS_DEBUG: + print "Levenshtein Module not available!" + def distance(s1, s2): + d = {} + lenstr1 = len(s1) + lenstr2 = len(s2) + for i in xrange(-1, lenstr1 + 1): + d[(i, -1)] = i + 1 + for j in xrange(-1, lenstr2 + 1): + d[(-1, j)] = j + 1 + + for i in xrange(0, lenstr1): + for j in xrange(0, lenstr2): + if s1[i] == s2[j]: + cost = 0 + else: + cost = 1 + d[(i, j)] = min( + d[(i - 1, j)] + 1, # deletion + d[(i, j - 1)] + 1, # insertion + d[(i - 1, j - 1)] + cost, # substitution + ) + if i > 1 and j > 1 and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]: + d[(i, j)] = min (d[(i, j)], d[i - 2, j - 2] + cost) # transposition + return d[lenstr1 - 1, lenstr2 - 1] + + def get_bibrefs_by_authornames_id(authornames_id): ''' Finds actual ids of the author name as it appears in bib10x or bib70x @param authornames_id: id in aidAUTHORNAMES @return: A list of sets. 
- The first set in the list contains all ids in bib10x - The second set in the list contains all ids in bib70x @rtype: list of sets ''' bibrefs = '' bibref_str = [row['bibrefs'] for row in dat.AUTHOR_NAMES if row['id'] == authornames_id] if len(bibref_str) > 0: bibrefs = bibref_str.split(",") b100 = set() b700 = set() for bibref in bibrefs: tag, refid = bibref.split(':') if tag == "100": b100.add(int(refid)) elif tag == "700": b700.add(int(refid)) else: bconfig.LOGGER.error("Wrong bibref Tag...how did you do that?") return [b100, b700] def name_matching(orig_name, target_name): """ Checks the compatibility of the given names. @param orig_name: The original name String @type orig_name: string @param target_name: The target name string @type target_name: string @return: true or false in respect to the compatibility of the given names @rtype: boolean """ orig = bibauthorid_utils.split_name_parts(orig_name) targ = bibauthorid_utils.split_name_parts(target_name) if (len(orig[1]) == 0) or (len(targ[1]) == 0): return True else: initials_set = set(orig[1]) names_set = set(orig[2]) comp_initials_set = set(targ[1]) comp_names_set = set(targ[2]) names_intersection = names_set.intersection(comp_names_set) initials_intersection = initials_set.intersection(comp_initials_set) if len(initials_intersection) == 0: if len(names_intersection) != 0: bconfig.LOGGER.error("length of names intersection != 0..." "This should never happen!") if ((len(names_intersection) == 0) and (len(comp_names_set) > 0) and (len(names_set) > 0)): return False if orig[1][0] == targ[1][0]: return True return False def search_matching_names(authorname_string, match_function=name_matching, consider_surname_only=True): """ search for matching names give a matching function. @warning: searching for matching name with consider_surname_only=false will be painfully slow! You've been warned. @warning: for mental sanity purposes the surnames not ending with a comma are being ignored; if you're searching for a surname without comma or names, the comma is being added automatically to the end of the string. @param authorname_string: The author name string @type authorname_string: string @param match_function: The function to use for the name matching @type match_function: function descriptor @param consider_surname_only: Decides if only names with the same surname shall be considered or _all_ other names. @type consider_surname_only: boolean @return: an array containing a tuple @rtype: list of tuples @note: example: search_matching_names('einstein, albert') Out[7]: [[(962L, 'Einstein, Albert'), ['Einstein', ['A'], ['Albert']]], [(1128L, 'Einstein, A.'), ['Einstein', ['A'], []]]] """ possible_names = [] names = [] if authorname_string.count(',') == 0: authorname_string += ',' authorname = bibauthorid_utils.split_name_parts(authorname_string) if consider_surname_only: names = [row for row in dat.AUTHOR_NAMES if row['name'].startswith(authorname[0])] else: names = [row for row in dat.AUTHOR_NAMES] for name in names: if match_function(authorname_string, name['name']): possible_names.append([(name['id'], name['name']), bibauthorid_utils.split_name_parts(name['name'])]) return possible_names def get_name_id(name_string): """ @return: the id associated to a given string in the authornames table. Returns -1 if the string is not found. 
@return: int """ name_id = -1 name = [row['id'] for row in dat.AUTHOR_NAMES if row['name'] == name_string] try: name_id = name[0] except (IndexError, ValueError): name_id = -1 return name_id def get_name_string(authorname_id): ''' Get name representation for an ID in authornames table @return: the name string associated with a particular authorid in the authornames table. If the ID is not found returns an empty string. @rtype: string ''' name_string = "" name = [row['name'] for row in dat.AUTHOR_NAMES if row['id'] == authorname_id] if len(name) > 0: name_string = name[0] return name_string def get_db_name_string(authorname_id): ''' Get name representation for an ID in authornames table @return: the name string associated with a particular authorid in the authornames table. If the ID is not found returns an empty string. @rtype: string ''' name_string = "" name = [row['db_name'] for row in dat.AUTHOR_NAMES if row['id'] == authorname_id] if len(name) > 0: name_string = name[0] return name_string def get_name_and_db_name_strings(authorname_id): ''' Get name representation for an ID in authornames table @return: the name string and the db name string associated with a particular authornameid in the authornames table. If the ID is not found returns empty values for the dict keys. @rtype: dict ''' names_dict = {"name": "", "db_name": ""} name = [row for row in dat.AUTHOR_NAMES if row['id'] == authorname_id] if len(name) > 0: names_dict["name"] = name[0]['name'] names_dict["db_name"] = name[0]['db_name'] return names_dict def get_name_bibrefs(authorname_id): """ Finds the bibrefID from authorname_id. @param authorname_id: ID of the author name to look up the bibliographic reference for @type authorname_id: int @return: the bibrefs associated with a particular authorid in the authornames table. If the ID is not found, an empty string shall be returned. @rtype: string """ bibref_string = "" bibrefs = [row['bibrefs'] for row in dat.AUTHOR_NAMES if row['id'] == authorname_id] if len(bibrefs) > 0: bibref_string = bibrefs[0] return bibref_string def update_doclist(bibrec_id, authorname_id="", bibref=""): """ Update doclist table given bibrec_id and processed author. (inserts a new document in the doclist table) @return: True if a new bibrecord has been added, false if this bibrecord was previously processed @rtype: boolean """ records = [row for row in dat.DOC_LIST if row['bibrecid'] == bibrec_id] if len(records) > 0: # @note maybe it's better to have a comma-separated list in the # 'authorname_id' column. That would keep the DB size # lower. First steps for the implementation introduced; update # procedure necessary. Descision might be harder. # Performance tests might help. for record in records: refrec = (authorname_id, bibref) if ((authorname_id) and (authorname_id not in record['authornameids']) and (refrec not in record['authornameid_bibrefrec'])): record['authornameids'] += [authorname_id] record['authornameid_bibrefrec'] += [refrec] elif ((authorname_id) and (authorname_id in record['authornameids']) and (refrec not in record['authornameid_bibrefrec'])): record['authornameid_bibrefrec'] += [refrec] else: bconfig.LOGGER.warn("The author has already been processed on." " the record. That's OK. 
Skipping entry.") return False else: if authorname_id: refrec = (authorname_id, bibref) dat.DOC_LIST.append({'bibrecid': bibrec_id, 'authornameids': [authorname_id], 'authornameid_bibrefrec': [refrec]}) else: dat.DOC_LIST.append({'bibrecid': bibrec_id, 'authornameids': [], 'authornameid_bibrefrec': []}) return True def soft_compare_names(origin_name, target_name): ''' Soft comparison of names, to use in search engine an similar Base results: If surname is equal in [0.6,1.0] If surname similar in [0.4,0.8] If surname differs in [0.0,0.4] all depending on average compatibility of names and initials. ''' jaro_fctn = None try: from Levenshtein import jaro_winkler jaro_fctn = jaro_winkler except ImportError: jaro_fctn = jaro_winkler_str_similarity score = 0.0 oname = deepcopy(origin_name) tname = deepcopy(target_name) orig_name = split_name_parts(oname.lower()) targ_name = split_name_parts(tname.lower()) orig_name[0] = clean_name_string(orig_name[0], replacement="", keep_whitespace=False) targ_name[0] = clean_name_string(targ_name[0], replacement="", keep_whitespace=False) if orig_name[0] == targ_name[0]: score += 0.6 else: if ((jaro_fctn(orig_name[0].lower(), targ_name[0].lower()) < .95) or min(len(orig_name[0]), len(targ_name[0])) <= 4): score += 0.0 else: score += 0.4 if orig_name[1] and targ_name[1]: max_initials = max(len(orig_name[1]), len(targ_name[1])) matching_i = 0 if len(orig_name[1]) >= 1 and len(targ_name[1]) >= 1: for i in orig_name[1]: if i in targ_name[1]: matching_i += 1 max_names = max(len(orig_name[2]), len(targ_name[2])) matching_n = 0 if len(orig_name[2]) >= 1 and len(targ_name[2]) >= 1: cleaned_targ_name = [clean_name_string(i, replacement="", keep_whitespace=False) for i in targ_name[2]] for i in orig_name[2]: if clean_name_string(i, replacement="", keep_whitespace=False) in cleaned_targ_name: matching_n += 1 name_score = (matching_i + matching_n) * 0.4 / (max_names + max_initials) score += name_score return score -def compare_names(origin_name, target_name): +def compare_names_old(origin_name, target_name): """ + DEPRECATED: this was used by earlier versions of the algorithm. Left in the code for + testing purposes. + Compute an index of confidence that would like to indicate whether two names might represent the same person.The computation is based on similarities of name structure, in particular: Initials: We assign an high score if all the initials matches are in the right order, much lower if they are in the wrong order Names: We assign a lower score for mismatching names and higher score for fully matching names If there is nothing to compare we are forced to assume a high score. Example for splitting names: In : bibauthorid.split_name_parts("Ellis, John R") Out: ['Ellis', ['J', 'R'], ['John']] Ellis, R. Keith => [ [Ellis], [R, K], [Keith] ] Ellis, Richard Keith => [ [Ellis], [R, K], [Richard, Keith] ] Since the initials are computed whether on the real initials present in the name string and using the full name, if there is no initials match we are 1 00% confident that: 1. we have no names/initials at all, or 2. we have completely different names; hence if there is no initial match we skip this step. 
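Illustrative usage (the names are made up and the scores are
approximate, since they depend on which string-similarity backend is
available at runtime):

compare_names_old("Ellis, John R", "Ellis, J") -> high score (about 0.9)
compare_names_old("Ellis, John R", "Smith, J") -> 0.0 (surnames differ, comparison skipped)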
@param orig_name: The first author's last name, first name(s) and initial @type orig_name: list of strings and lists of strings @param targ_name: The second author's last name, first name(s) and initial @type targ_name: list of strings and lists of strings @return: a value that describes the likelihood of the names being the same @rtype: float """ jaro_fctn = None try: from Levenshtein import jaro_winkler jaro_fctn = jaro_winkler except ImportError: jaro_fctn = jaro_winkler_str_similarity oname = deepcopy(origin_name) tname = deepcopy(target_name) orig_name = split_name_parts(oname.lower()) targ_name = split_name_parts(tname.lower()) bconfig.LOGGER.info("|--> Comparing Names: \"%s\" and \"%s\"" % (origin_name, target_name)) lastname_modifier = 0.0 if not (orig_name[0] == targ_name[0]): # last names are not equal before cleaning them. Assign entry penalty. lastname_modifier = 0.15 orig_name[0] = clean_name_string(orig_name[0], replacement="", keep_whitespace=False) targ_name[0] = clean_name_string(targ_name[0], replacement="", keep_whitespace=False) if not (orig_name[0] == targ_name[0]): if ((jaro_fctn(orig_name[0].lower(), targ_name[0].lower()) < .95) or min(len(orig_name[0]), len(targ_name[0])) <= 4): bconfig.LOGGER.warn(("Unequal lastnames(%s vs. %s)." + "Skipping Comparison") % (orig_name[0], targ_name[0])) return 0.0 else: bconfig.LOGGER.log(25, "Last names are not equal; " + "but similar enough to continue the comparison") # Let it go through...however, reduce the final result a little. lastname_modifier = 0.24 else: # last names are equal after cleaning them. Reduce penalty. if lastname_modifier == 0.15: lastname_modifier = 0.02 if orig_name[2] and targ_name[2]: if len(orig_name[2]) > 1 or len(targ_name[2]) > 1: variation_ps = [] oname_variations = create_name_tuples(orig_name[2]) tname_variations = create_name_tuples(targ_name[2]) for oname_variation in oname_variations: for tname_variation in tname_variations: oname_var = split_name_parts("%s, %s" % (orig_name[0], oname_variation)) tname_var = split_name_parts("%s, %s" % (targ_name[0], tname_variation)) variation_ps.append(_perform_matching(oname_var, tname_var)) return max(variation_ps) - lastname_modifier return _perform_matching(orig_name, targ_name) - lastname_modifier def _perform_matching(orig_name, targ_name): ''' @param orig_name: @type orig_name: @param targ_name: @type targ_name: ''' tname = deepcopy(targ_name) oname = deepcopy(orig_name) potential_name_matches = min(len(oname[2]), len(tname[2])) names_p_weight = 0.0 initials_p_weight = _compare_initials(oname, tname) if initials_p_weight > 0.0: names_p_weight = _compare_first_names(oname, tname) names_w = .5 ini_w = .5 if (names_p_weight > 0.6) and (potential_name_matches > 0): names_w = .7 ini_w = .3 if (initials_p_weight == 1.0) and (len(oname[1]) != len(tname[1])): initials_p_weight -= .1 if (names_p_weight == 1.0) and ((len(oname[2]) != len(tname[2])) or not len(oname[2])) and (potential_name_matches < 2): names_p_weight -= .1 if (initials_p_weight == 1.0) and (names_p_weight <= 0): names_w = 0. ini_w = 0. res = names_w * names_p_weight + ini_w * initials_p_weight # print "|--> Comparing Names: %s and %s" % (oname, tname) bconfig.LOGGER.debug(("|---> iWeight (%s) * ip (%s) + nWeight " + "(%s) * nP (%s) = %s") % (ini_w, initials_p_weight, names_w, names_p_weight, res)) return (names_w * names_p_weight + ini_w * initials_p_weight) def _compare_initials(orig_name, targ_name): ''' Compares Author's initials and returns the assigned score. 
@param orig_name: The first author's last name, first name(s) and initial @type orig_name: list of strings and lists of strings @param targ_name: The second author's last name, first name(s) and initial @type targ_name: list of strings and lists of strings @return: a value describing the likelihood of the initials being the same @rtype: float ''' # determine minimal number of initials and declare the # count of max. possible matches tname = deepcopy(targ_name) oname = deepcopy(orig_name) max_possible_matches = min(len(oname[1]), len(tname[1])) initial_weight_denominator = (float(1 + max_possible_matches) / 2.0) * max_possible_matches initials_p_weight = 0.0 if max_possible_matches > 0: for index, item in enumerate(oname[1]): # print "|---> Trying Initial: ", I if index < max_possible_matches: try: targ_index = tname[1].index(item) if index == targ_index: initials_p_weight += ( float(index + 1) / initial_weight_denominator) else: initials_p_weight += 1. / (5 * max_possible_matches * abs(index - targ_index)) tname[1][targ_index] = '' except (IndexError, ValueError, ZeroDivisionError): # initials_p_weight = 0.1 break else: initials_p_weight = 0.0 return initials_p_weight def _compare_first_names(orig_name, targ_name): ''' Compares Author's first names and returns the assigned score. @param orig_name: The first author's last name, first name(s) and initial @type orig_name: list of strings and lists of strings @param targ_name: The second author's last name, first name(s) and initial @type targ_name: list of strings and lists of strings @return: a value that describes the likelihood of the names being the same @rtype: float ''' # determine minimal number of names and declare the # count of max. possible matches string_similarity = None try: from Levenshtein import jaro_winkler string_similarity = jaro_winkler except ImportError: string_similarity = jaro_winkler_str_similarity tname = deepcopy(targ_name) oname = deepcopy(orig_name) names_p_weight = 0.0 max_possible_matches = float(min(len(oname[2]), len(tname[2]))) name_weight_denominator = ((1.0 + max_possible_matches) / 2.0 * max_possible_matches) equal_set = set(oname[2]).intersection(set(tname[2])) equal_names = [i for i in oname[2] if i in equal_set] if max_possible_matches < 1.: return 1.0 if len(equal_names) == max_possible_matches: for index, item in enumerate(equal_names): if index <= max_possible_matches: try: targ_index = tname[2].index(item) initial_index = oname[1].index(item[0].upper()) if (index == targ_index) or (initial_index == targ_index): names_p_weight += (float(index + 1) / float(name_weight_denominator)) else: names_p_weight += 1. / (2 * max_possible_matches * abs(index - targ_index)) tname[2][targ_index] = '' except (IndexError, ValueError, ZeroDivisionError): break else: fuzzy_matches = 0 wrong_position_modifier = 0 # for name1 in oname[2]: # for name2 in tname[2]: # similarity = string_similarity(name1, name2) # if similarity > 0.91: # fuzzy_matches += 1 # if oname[2].index(name1) != tname[2].index(name2): # wrong_position_modifier += 0.05 for name1 in oname[2]: for name2 in tname[2]: fuzzy_matches += string_similarity(name1, name2) if oname[2].index(name1) != tname[2].index(name2): wrong_position_modifier += 0.05 if fuzzy_matches > 0: num_comparisons = len(oname[2]) * len(tname[2]) names_p_weight = (fuzzy_matches / num_comparisons - wrong_position_modifier) else: names_p_weight = -0.3 return names_p_weight def create_name_tuples(names): ''' Find name combinations, i.e. 
permutations of the names in different positions of the name @param names: a list of names @type names: list of string @return: the combinations of the names given @rtype: list of lists of strings ''' length = float(len(names)) max_tuples = int((length / 2) * (length - 1)) current_tuple = 1 pos = 0 off = 1 variants = [" ".join(names)] for i in range(max_tuples): variant = "%s %s %s" % (' '.join(names[0:pos]), ''.join(names[pos:off + 1]).capitalize(), ' '.join(names[off + 1::])) variants.append(variant.strip()) pos += 1 off += 1 if off >= length: pos = i * 0 off = current_tuple + 1 current_tuple += 1 return variants def jaro_str_distance(str1, str2): """ The Jaro string similarity algorithm as described in 'Jaro, M.A. (1989): "Advances in record linkage methodology as applied to the 1985 census of Tampa Florida' @param str1: The first string @type str1: string @param str2: The second string @type str2: string @return: approximate string comparison measure (between 0.0 and 1.0) @rtype: float """ if (not str1) or (not str2): return 0.0 elif str1 == str2: return 1.0 jaro_marker = chr(1) len1 = len(str1) len2 = len(str2) halflen = max(len1, len2) / 2 + 1 assignments1 = '' assignments2 = '' workstr1 = str1 workstr2 = str2 common1 = common2 = 0 # Analyze the first string for i in xrange(len1): start = max(0, i - halflen) end = min(i + halflen + 1, len2) index = workstr2.find(str1[i], start, end) if index > -1: # Found common character common1 += 1 assignments1 = assignments1 + str1[i] workstr2 = workstr2[:index] + jaro_marker + workstr2[index + 1:] # Analyze the second string for i in xrange(len2): start = max(0, i - halflen) end = min(i + halflen + 1, len1) index = workstr1.find(str2[i], start, end) if (index > -1): # Found common character common2 += 1 assignments2 = assignments2 + str2[i] workstr1 = workstr1[:index] + jaro_marker + workstr1[index + 1:] common = float(common1 + common2) / 2.0 if (common == 0): return 0.0 transpositions = 0 for i in xrange(len(assignments1)): if (assignments1[i] != assignments2[i]): transpositions += 1 transpositions /= 2.0 common = float(common) len1 = float(len1) len2 = float(len2) jaro_constant = 1.0 / 3.0 jaro_transpositions = (common1 - transpositions) / common1 jaro_common_to_len_ratio = common1 / len1 + common1 / len2 dist = jaro_constant * (jaro_common_to_len_ratio + jaro_transpositions) return dist def _winkler_modifier(str1, str2, in_weight): """ Applies the winkler modifier to a score obtained by the Jaro string similarity measure. This is described in Winkler, W.E. (1999) "The state of record linkage and current research problems". If the first characters of the two strings (up to first 4) are identical, the similarity weight will be increased. 
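# A hand-worked sanity check of the Jaro and Winkler formulas described above,
# using the classic "martha" / "marhta" pair: 6 matching characters, one
# transposition (t/h), and a shared 3-character prefix "mar" boosted by the
# 0.1 factor applied in _winkler_modifier below. Values are rounded; this only
# mirrors the arithmetic and does not call the module's functions.
matches, transpositions, strlen, prefix_len = 6.0, 1.0, 6.0, 3
jaro = (matches / strlen + matches / strlen +
        (matches - transpositions) / matches) / 3.0     # ~0.944
winkler = jaro + prefix_len * 0.1 * (1.0 - jaro)        # ~0.961
print round(jaro, 3), round(winkler, 3)                 # prints: 0.944 0.961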
@param str1: The first string @type str1: string @param str2: The second string @type str2: string @param in_weight: Similarity score obtained by the Jaro algorithm @type in_weight: float @return: approximate string comparison measure (between 0.0 and 1.0) @rtype: float """ if (not str1) or (not str2): return 0.0 elif str1 == str2: return 1.0 # Compute how many characters are common at beginning minlen = min(len(str1), len(str2)) common_chars_num = 0 for common_chars_num in xrange(1, minlen + 1): if str1[:common_chars_num] != str2[:common_chars_num]: break common_chars_num -= 1 if (common_chars_num > 4): common_chars_num = 4 winkler_weight = in_weight + common_chars_num * 0.1 * (1.0 - in_weight) final_result = 0.0 if winkler_weight >= 0.0 and winkler_weight <= 1.0: final_result = winkler_weight elif winkler_weight > 1.0: final_result = 1.0 return final_result def jaro_winkler_str_similarity(str1, str2): """ For backwards compatibility, call Jaro followed by Winkler modification. @param str1: The first string @type str1: string @param str2: The second string @type str2: string @return: approximate string comparison measure (between 0.0 and 1.0) @rtype: float """ jaro_weight = jaro_str_distance(str1, str2) return _winkler_modifier(str1, str2, jaro_weight) -def names_are_equal_composites(name1, name2): +def full_names_are_equal_composites(name1, name2): ''' Checks if names are equal composites; e.g. "guangsheng" vs. "guang sheng" - @param name1: Name string of the first name (w/ last name) + @param name1: Full Name string of the first name (w/ last name) @type name1: string - @param name2: Name string of the second name (w/ last name) + @param name2: Full Name string of the second name (w/ last name) @type name2: string @return: Are the names equal composites? @rtype: boolean ''' if not isinstance(name1, list): name1 = split_name_parts(name1) if not isinstance(name2, list): name2 = split_name_parts(name2) is_equal_composite = False oname_variations = create_name_tuples(name1[2]) tname_variations = create_name_tuples(name2[2]) for oname_variation in oname_variations: for tname_variation in tname_variations: oname = clean_name_string(oname_variation.lower(), "", False, True) tname = clean_name_string(tname_variation.lower(), "", False, True) if oname == tname: is_equal_composite = True break return is_equal_composite def names_are_equal_gender(name1, name2, gendernames): ''' - Checks on gender equality of two names baes on a word list + Checks if names have the same gender + @param gendernames: dictionary male/female names + ''' + g1 = [name1 in gendernames['boys'], name1 in gendernames['girls']] + g2 = [name2 in gendernames['boys'], name2 in gendernames['girls']] - @param name1: Name string of the first name (w/ last name) + if (g1[0] == g2[0] == True) and (g1[1] == False or g2[1] == False): + return True + if (g1[1] == g2[1] == True) and (g1[0] == False or g2[0] == False): + return True + return False + +def full_names_are_equal_gender(name1, name2, gendernames): + ''' + Checks on gender equality of two first names baes on a word list + + @param name1: Full Name string of the first name (w/ last name) @type name1: string - @param name2: Name string of the second name (w/ last name) + @param name2: Full Name string of the second name (w/ last name) @type name2: string @param gendernames: dictionary of male/female names @type gendernames: dict @return: Are names gender-equal? 
@rtype: boolean ''' if not isinstance(name1, list): name1 = split_name_parts(name1) if not isinstance(name2, list): name2 = split_name_parts(name2) - print_debug = False names_are_equal_gender_b = True ogender = None tgender = None - oname = name1[2][0].lower() - tname = name2[2][0].lower() - oname = clean_name_string(oname, "", False, True) - tname = clean_name_string(tname, "", False, True) - - if oname in gendernames['boys']: - ogender = 'Male' - elif oname in gendernames['girls']: - ogender = 'Female' - - if tname in gendernames['boys']: - tgender = 'Male' - elif tname in gendernames['girls']: - tgender = 'Female' - - if print_debug: - print ' Gender check: ', oname, ' is a ', ogender - print ' Gender check: ', tname, ' is a ', tgender +# oname = name1[2][0].lower() +# tname = name2[2][0].lower() +# oname = clean_name_string(oname, "", False, True) +# tname = clean_name_string(tname, "", False, True) + + onames = [clean_name_string(n.lower(), "", False, True) for n in name1[2]] + tnames = [clean_name_string(n.lower(), "", False, True) for n in name2[2]] + + for oname in onames: + if oname in gendernames['boys']: + if ogender != 'Conflict': + if ogender != 'Female': + ogender = 'Male' + else: + ogender = 'Conflict' + elif oname in gendernames['girls']: + if ogender != 'Conflict': + if ogender != 'Male': + ogender = 'Female' + else: + ogender = 'Conflict' + + for tname in tnames: + if tname in gendernames['boys']: + if tgender != 'Conflict': + if tgender != 'Female': + tgender = 'Male' + else: + tgender = 'Conflict' + elif tname in gendernames['girls']: + if tgender != 'Conflict': + if tgender != 'Male': + tgender = 'Female' + else: + tgender = 'Conflict' if ogender and tgender: - if ogender != tgender: - if print_debug: - print ' Gender differs, force split!' + if ogender != tgender or ogender == 'Conflict' or tgender == 'Conflict': names_are_equal_gender_b = False return names_are_equal_gender_b def names_are_synonymous(name1, name2, name_variations): + ''' + Checks if names are synonims + @param name_variations: name variations list + @type name_variations: list of lists + ''' + + a = [name1 in nvar and name2 in nvar for nvar in name_variations] + if True in a: + return True + return False + +def full_names_are_synonymous(name1, name2, name_variations): ''' Checks if two names are synonymous; e.g. "Robert" vs. "Bob" - @param name1: Name string of the first name (w/ last name) + @param name1: Full Name string of the first name (w/ last name) @type name1: string - @param name2: Name string of the second name (w/ last name) + @param name2: Full Name string of the second name (w/ last name) @type name2: string @param name_variations: name variations list @type name_variations: list of lists @return: are names synonymous @rtype: boolean ''' if not isinstance(name1, list): name1 = split_name_parts(name1) if not isinstance(name2, list): name2 = split_name_parts(name2) print_debug = False names_are_synonymous_b = False max_matches = min(len(name1[2]), len(name2[2])) matches = [] for i in xrange(max_matches): matches.append(False) for nvar in name_variations: for i in xrange(max_matches): oname = name1[2][i].lower() tname = name2[2][i].lower() oname = clean_name_string(oname, "", False, True) tname = clean_name_string(tname, "", False, True) if oname in nvar and tname in nvar: if print_debug: - print ' ', oname, ' and ', tname, ' are synonyms! Not splitting!' + print ' ', oname, ' and ', tname, ' are synonyms!' 
matches[i] = True if sum(matches) == max_matches: names_are_synonymous_b = True break return names_are_synonymous_b def names_are_substrings(name1, name2): + ''' + Checks if the names are substrings of each other, left to right + @return: bool + ''' + return name1.startswith(name2) or name2.startswith(name1) + +def full_names_are_substrings(name1, name2): ''' Checks if two names are substrings of each other; e.g. "Christoph" vs. "Ch" Only checks for the beginning of the names. - @param name1: Name string of the first name (w/ last name) + @param name1: Full Name string of the first name (w/ last name) @type name1: string - @param name2: Name string of the second name (w/ last name) + @param name2: Full Name string of the second name (w/ last name) @type name2: string @return: are the names substrings of each other @rtype: boolean ''' if not isinstance(name1, list): name1 = split_name_parts(name1) if not isinstance(name2, list): name2 = split_name_parts(name2) onames = name1[2] tnames = name2[2] # oname = "".join(onames).lower() # tname = "".join(tnames).lower() - oname = clean_name_string("".join(onames).lower(), "", False, True) - tname = clean_name_string("".join(tnames).lower(), "", False, True) - names_are_substrings_b = False - if (oname.startswith(tname) - or tname.startswith(oname)): - names_are_substrings_b = True + names_are_substrings_b = False + for o in onames: + oname = clean_name_string(o.lower(), "", False, True) + for t in tnames: + tname = clean_name_string(t.lower(), "", False, True) + if (oname.startswith(tname) + or tname.startswith(oname)): + names_are_substrings_b = True return names_are_substrings_b -def names_minimum_levenshtein_distance(name1, name2): +def names_levenshtein_distance(name1, name2): + ''' + Returns the Levenshtein distance between two strings + TODO: improve to give more sensible results in the case of synonym names? + ''' + return distance(name1, name2) + +def full_names_minimum_levenshtein_distance(name1, name2): ''' Determines the minimum distance D between two names. Comparison is based on the minimum number of first names. Examples: D("guang", "guang sheng") = 0 D("guang", "guangsheng") = 5 D("guang sheng", "guangsheng") = 5 D("guang sheng", "guang shing") = 1 D("guang ming", "guang fin") = 2 @precondition: Names have been checked for composition equality.
- @param name1: Name string of the first name (w/ last name) + @param name1: Name string of the first name (w/ last name), force split @type name1: string @param name2: Name string of the second name (w/ last name) @type name2: string - +weather @return: the minimum Levenshtein distance between two names @rtype: int ''' - try: - from Levenshtein import distance - except ImportError: - bconfig.LOGGER.exception("Levenshtein Module not available!") - return - 1 if not isinstance(name1, list): name1 = split_name_parts(name1) if not isinstance(name2, list): name2 = split_name_parts(name2) onames = name1[2] tnames = name2[2] # min_names_count = min(len(onames), len(tnames)) # # if min_names_count <= 0: # return -1 # # oname = "".join(onames[:min_names_count]).lower() # tname = "".join(tnames[:min_names_count]).lower() oname = clean_name_string("".join(onames).lower(), "", False, True) tname = clean_name_string("".join(tnames).lower(), "", False, True) return distance(oname, tname) + +def _load_gender_firstnames_dict(files=''): + if not files: + files = {'boy': CFG_ETCDIR + '/bibauthorid/name_authority_files/male_firstnames.txt', + 'girl': CFG_ETCDIR + '/bibauthorid/name_authority_files/female_firstnames.txt'} + + boyf = open(files['boy'], 'r') + boyn = [x.strip().lower() for x in boyf.readlines()] + boyf.close() + girlf = open(files['girl'], 'r') + girln = [x.strip().lower() for x in girlf.readlines()] + girlf.close() + return {'boys':boyn, 'girls':girln} + + +def _load_firstname_variations(filename=''): + #will load an array of arrays: [['rick','richard','dick'],['john','jhonny']] + if not filename: + filename = CFG_ETCDIR + '/bibauthorid/name_authority_files/name_variants.txt' + retval = [] + r = re.compile("\n") + fp = open(filename) + + for l in fp.readlines(): + lr = r.sub("", l) + retval.append([clean_name_string(name.lower(), "", False, True) + for name in lr.split(";") if name]) + + fp.close() + + return retval + +def compare_names(origin_name, target_name): + ''' + Compare two names. 
+ mode can be: + -float: returns a value in range [0,1] to guess name equality + -full: returns [surname_distance, names_distance, names_are_substring, names_are_synonim, + same_gender, names_equal_upon_composition, exactly_same_initials, + initials_have_intersection] + ''' + AUTHORNAMES_UTILS_DEBUG = bconfig.AUTHORNAMES_UTILS_DEBUG + MAX_ALLOWED_SURNAME_DISTANCE = 2 + if AUTHORNAMES_UTILS_DEBUG: + print "\nComparing: " , origin_name, ' ', target_name + gendernames = GLOBAL_gendernames + name_variations = GLOBAL_name_variations + no = split_name_parts(origin_name) + nt = split_name_parts(target_name) + + if AUTHORNAMES_UTILS_DEBUG: + print "|- splitted no: ", no + print "|- splitted nt: ", nt + + score = 0.0 + + surname_dist = distance(no[0], nt[0]) + if AUTHORNAMES_UTILS_DEBUG: + print "|- surname distance: ", surname_dist + if surname_dist > 0: + score = max(0.0, 0.5 - (float(surname_dist) / float(MAX_ALLOWED_SURNAME_DISTANCE))) + else: + score = 1 + if AUTHORNAMES_UTILS_DEBUG: + print '||- surname score: ', score + + initials_only = ((min(len(no[2]), len(nt[2]))) == 0) + if AUTHORNAMES_UTILS_DEBUG: + print '|- initials only: ', initials_only + + names_are_equal_composites = False + if not initials_only: + names_are_equal_composites = full_names_are_equal_composites(origin_name, target_name) + if AUTHORNAMES_UTILS_DEBUG: + print "|- equal composites: ", names_are_equal_composites + + max_n_initials = max_n_initials = max(len(no[1]), len(nt[1])) + initials_intersection = set(no[1]).intersection(set(nt[1])) + n_initials_intersection = len(initials_intersection) + initials_union = set(no[1]).union(set(nt[1])) + n_initials_union = len(initials_union) + + + initials_distance = distance("".join(no[1]), "".join(nt[1])) + if n_initials_union > 0: + initials_c = float(n_initials_intersection) / float(n_initials_union) + else: + initials_c = 1 + + if len(no[1]) > len(nt[1]): + alo = no[1] + alt = nt[1] + else: + alo = nt[1] + alt = no[1] + lo = len(alo) + lt = len(alt) + if max_n_initials > 0: + initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo)) + if lo - 1 - i < lt and k != alt[lo - 1 - i] ]) / \ + float(float(max_n_initials * (max_n_initials + 1)) / 2) + initials_distance = initials_distance / max_n_initials + else: + initials_screwup = 0 + initials_distance = 0 + + score = score - (0.75 * initials_screwup + 0.10 * (1 - initials_c)\ + + 0.15 * initials_distance) * (score) + if AUTHORNAMES_UTILS_DEBUG: + print "|- initials sets: ", no[1], " ", nt[1] + print "|- initials distance: ", initials_distance + print "|- initials c: ", initials_c + print "|- initials screwup: ", initials_screwup + print "||- initials score: ", score + + composits_eq = full_names_are_equal_composites(no, nt) + if len(no[2]) > 0 and len(nt[2]) > 0: + gender_eq = full_names_are_equal_gender(no, nt, gendernames) + else: + gender_eq = True + vars_eq = full_names_are_synonymous(no, nt, name_variations) + substr_eq = full_names_are_substrings(no, nt) + + if not initials_only: + if len(no[2]) > len(nt[2]): + nalo = no[2] + nalt = nt[2] + else: + nalo = nt[2] + nalt = no[2] + nlo = len(nalo) + nlt = len(nalt) + names_screwup_list = [(distance(k, nalt[nlo - 1 - i]), max(len(k), len(nalt[nlo - 1 - i]))) + for i, k in enumerate(reversed(nalo)) \ + if nlo - 1 - i < nlt] + max_names_screwup = max([float(i[0]) / i[1] for i in names_screwup_list]) + avg_names_screwup = sum([float(i[0]) / i[1] for i in names_screwup_list])\ + / len(names_screwup_list) + + else: + max_names_screwup = 0 + avg_names_screwup = 0 + + score = 
score - score * 0.75 * max_names_screwup - score * 0.25 * avg_names_screwup + if AUTHORNAMES_UTILS_DEBUG: + print "|- max names screwup: ", max_names_screwup + print "|- avg screwup: ", avg_names_screwup + print "||- names score: ", score + print "|- names composites: ", composits_eq + print "|- same gender: ", gender_eq + print "|- synonims: ", vars_eq + print "|- substrings: ", substr_eq + + if vars_eq: + synmap = [[i, j, names_are_synonymous(i, j, name_variations)] for i in no[2] for j in nt[2]] + synmap = [i for i in synmap if i[2] == True] + if AUTHORNAMES_UTILS_DEBUG: + print "|-- synmap: ", synmap + for i in synmap: + if no[2].index(i[0]) == nt[2].index(i[1]): + score = score + (1 - score) * 0.5 + else: + score = score + (1 - score) * 0.15 + else: + if AUTHORNAMES_UTILS_DEBUG: + print "|-- synmap: empty" + if AUTHORNAMES_UTILS_DEBUG: + print "|-- synmap score: ", score + + if substr_eq and not initials_only: + ssmap = [[i, j, names_are_substrings(i, j)] for i in no[2] for j in nt[2]] + ssmap = [i for i in ssmap if i[2] == True] + if AUTHORNAMES_UTILS_DEBUG: + print "|-- substr map: ", ssmap + for i in ssmap: + if no[2].index(i[0]) == nt[2].index(i[1]): + score = score + (1 - score) * 0.2 + else: + score = score + (1 - score) * 0.05 + else: + if AUTHORNAMES_UTILS_DEBUG: + print "|-- substr map: empty" + + if AUTHORNAMES_UTILS_DEBUG: + print "|-- substring score: ", score + + if composits_eq and not initials_only: + if AUTHORNAMES_UTILS_DEBUG: + print "|-- composite names" + score = score + (1 - score) * 0.2 + else: + if AUTHORNAMES_UTILS_DEBUG: + print "|-- not composite names" + if AUTHORNAMES_UTILS_DEBUG: + print "|-- composite score: ", score + + if not gender_eq: + score = score / 3. + if AUTHORNAMES_UTILS_DEBUG: + print "|-- apply gender penalty" + else: + if AUTHORNAMES_UTILS_DEBUG: + print "|-- no gender penalty" + + if AUTHORNAMES_UTILS_DEBUG: + print "|-- gender score: ", score + + if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE: + score = 0.0 + if AUTHORNAMES_UTILS_DEBUG: + print "|- surname trim: ", score + else: + if AUTHORNAMES_UTILS_DEBUG: + print "|- no surname trim: ", score + if AUTHORNAMES_UTILS_DEBUG: + print "||- final score: ", score + + + return score + +GLOBAL_gendernames = _load_gender_firstnames_dict() +GLOBAL_name_variations = _load_firstname_variations() + + + + + + + + + + + + diff --git a/modules/bibauthorid/lib/bibauthorid_cli.py b/modules/bibauthorid/lib/bibauthorid_cli.py index 5e7dc7e5d..3ee841575 100644 --- a/modules/bibauthorid/lib/bibauthorid_cli.py +++ b/modules/bibauthorid/lib/bibauthorid_cli.py @@ -1,307 +1,310 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ bibauthorid_cli This module provides a command-line interface for BibAuthorID. 
""" import getopt import sys import time import os.path as osp import bibauthorid_config as bconfig import bibauthorid as engine import bibauthorid_structs as dat from bibauthorid_file_utils import populate_structs_from_files from bibauthorid_file_utils import write_mem_cache_to_files from bibauthorid_file_utils import make_directory #log = bconfig.get_logger("bibauthor.cli") def main(): """Main function """ arguments = sys.argv if len(arguments) <= 1: bconfig.LOGGER.error("Please provide parameters!") _display_help() run_daemon = True standalone_option = ("-S", "--standalone", "-j", "--job-dir") for option in standalone_option: for arg in arguments: if arg.startswith(option): run_daemon = False if run_daemon: daemon = None try: import bibauthorid_daemon as daemon except ImportError: bconfig.LOGGER.error("Hmm...No Daemon process running.") if daemon: daemon.bibauthorid_daemon() else: options = _read_options(arguments) if options["job_dir"]: job_dir = options["job_dir"] if job_dir.endswith("/"): job_dir = job_dir[0:-1] log_name = osp.abspath(job_dir).split("/") logfile = "%s/%s.log" % (job_dir, log_name[-1]) start = time.time() bconfig.init_logger(logfile) populate_structs_from_files(job_dir) bconfig.LOGGER.debug("| Loaded %s records." % len(dat.RELEVANT_RECORDS)) engine.start_computation(process_doclist=True, process_orphans=True, print_stats=True) result_path = "%s/results/" % (job_dir,) if make_directory(result_path): write_mem_cache_to_files(result_path, is_result=True) else: bconfig.LOGGER.error("Cannot write to destination: " "Cannot create directory") end = time.time() - start bconfig.LOGGER.log(25, "Finish! The computation finished in %.2fs" % (end)) bconfig.stop_and_close_logger() else: bconfig.LOGGER.error("Standalone mode without parameters " "does not do anything helpful. Please" "consult -h help message for usage") def _display_help(): """Prints the help message for this module.""" print """Usage: bibauthorid [OPTIONS] Runs the author disambiguation and identity matching algorithm. General options: -h, --help Display this help and exit -V, --version Output version information and exit -v, --verbose=LEVEL Number between 1 and 50. The higher the number, the higher the level of importance. Everything below the number will be ignored. Equal and above will be shovn. Debugging=10, Info=20, Bibauthorid default log=25, Warnings=30, Errors=40] -S, --standalone Switches on stand alone mode. This is required for jobs that should run on a set of files rather than on the database (e.g. this is needed on the grid). Without this switch no standalone job will start or perform. Daemon mode options: Commands: NOTE: Options -n, -a, -U, -G and -R are mutually exclusive (XOR)! -n, --lastname=STRING Process only authors with this last name. -a, --process-all The option for cleaning all authors. -U, --update-universe Update bibauthorid universe. Find modified and newly entered records and process all the authors on these records. -G, --prepare-grid Prepares a set of files that supply the pre-clustered data needed for stand alone job to run (e.g. needed on the grid). The behavior of this export can be controlled with the options -d (required), -p and -m (both optional). -R, --load-grid-results Loads the results from the grid jobs and writes them to the database. The behavior of this import can be controlled with the options -d (required). --update-cache Updates caches to the newly introduced changes (new and modified documents). 
This should be called daily or better more then once per day, to ensure the correct operation of the frontend (and the backend). --clean-cache Clean the cache from out of date contents (deleted documents). --repair-personid Deletes untouched person entities to then re-create and updated these entities. --fast-update-personid Updates personid adding not yet assigned papers to the system, - in a fast, best effort basis. + in a fast, best effort basis. -r to specify records. + --personid-gc Runs the gc on personid. -r to specify records. Options: -r, --record-ids=NUM Specifies a list of record ids. To use as on option for --update-universe to limit the update to the selected records. Must be space less CSVs. --all-records To use as on option for --update-universe to perform the update an all existing record ids. Be WARNED that this will empty and re-fill all aid* tables in the process! -d, --data-dir=DIRNAME Specifies the data directory, in which the data for the grid preparation will be stored to or loaded from. It requires the -G or -R switch. -p, --prefix=STRING Specifies the prefix of the directories created under the --data-dir directory. Optional. Defaults to 'job'. It requires the -G switch. -m, --max-records=NUM Specifies the number of records that shall be stored per job package. Optional. Defaults to 4000 and requires -G switch. Standalone mode options: -j, --job-dir=DIRECTORY Run the job on the files found under the path specified here. Supplying a directory is mandatory. The results of the process will be stored in a sub directory of --job-dir named 'results'. These results can be loaded into the db with the -R option of this command line tool. Examples (daemon mode): - Process all records that hold an author with last name 'Ellis': $ bibauthorid -u admin --lastname "Ellis" - Process all records and regard all authors: $ bibauthorid -u admin --process-all - To update all information from newly entered and modified records: $ bibauthorid -u admin -U - Prepare job packages in folder 'gridfiles' with the sub directories prefixed with 'task' and a maximum number of 2000 records per package: $ bibauthorid -u admin --prepare-grid -d gridfiles -p task -m 2000 Examples (standalone mode): - Process the job package stored in folder 'grid_data/job0' $ bibauthorid -S --job-dir=grid_data/job0 """ sys.exit(1) def _display_version(): """Display Bibauthorid version and exit.""" try: from invenio.config import CFG_VERSION print "\nInvenio/%s bibauthorid v%s\n" % (CFG_VERSION, bconfig.VERSION) except ImportError: print "\nInvenio bibauthorid (standalone) v%s\n" % (bconfig.VERSION) sys.exit(1) def _read_options(options_string): """Reads the options, test if the specified values are consistent and populates the options dictionary.""" options = { "lastname": "None,", "do_all": False, "output_limit": 20, "prepare_grid": False, "prefix": "job", "data_dir": "data_dir", "standalone": False, "job_dir": False, "max_records": 4000, "load_grid_results": False, "update": False, "update_cache": False, "clean_cache": False, "record_ids" : None, "all_records": False, "repair_pid": False, - "fast_update_personid":False + "fast_update_personid": False, + "personid_gc": False } try: short_flags = "r:n:v:i:d:p:j:m:USGRahV" long_flags = ["lastname=", "verbose=", "recid=", "process-all", "help", "version", "prepare-grid", "prefix=", "data-dir=", "standalone", "job-dir=", "max-records=", "load-grid-results", "update-universe", "update-cache", "clean-cache", "record-ids=", "all-records", "repair-personid", - 
"fast-update-personid="] + "fast-update-personid", "personid-gc"] opts, args = getopt.gnu_getopt(options_string, short_flags, long_flags) except getopt.GetoptError, err1: print >> sys.stderr, "Parameter problem: %s" % err1 _display_help() # 2 dictionaries containing the option linked to its destination in the # options dictionary. with_argument = { "-n": "lastname", "--lastname": "lastname", "-d": "data_dir", "--data-dir": "data_dir", "-p": "prefix", "--prefix": "prefix", "-j": "job_dir", "--job-dir": "job_dir", "-m": "max_records", "--max-records": "max_records", "--record-ids": "record_ids", - "-r": "record_ids", - "--fast-update-personid":"fast_update_personid" + "-r": "record_ids" } without_argument = { "-a": "do_all", "--process-all": "do_all", "-U": "update", "--update-universe": "update", "-G": "prepare_grid", "--prepare-grid": "prepare_grid", "-S": "standalone", "--standalone": "standalone", "-R": "load_grid_results", "--load-grid-results": "load_grid_results", "--update-cache": "update_cache", "--clean-cache": "clean_cache", "--all-records": "all_records", - "--repair-personid": "repair_pid" + "--repair-personid": "repair_pid", + "--fast-update-personid":"fast_update_personid", + "--personid-gc":"personid_gc" } for option, argument in opts: if option in ("-h", "--help"): _display_help() elif option in ("-V", "--version"): _display_version() elif option in ("-v", "--verbose"): bconfig.LOG_LEVEL = int(argument) elif option in with_argument: options[with_argument[option]] = argument elif option in without_argument: options[without_argument[option]] = True else: # This shouldn't happen as gnu_getopt should already handle # that case. bconfig.LOGGER.error("option unrecognized -- %s" % option) # Collect the text inputs. options["text_files"] = args return options if __name__ == '__main__': main() diff --git a/modules/bibauthorid/lib/bibauthorid_config.py b/modules/bibauthorid/lib/bibauthorid_config.py index 8985617c2..7f0db914d 100644 --- a/modules/bibauthorid/lib/bibauthorid_config.py +++ b/modules/bibauthorid/lib/bibauthorid_config.py @@ -1,307 +1,312 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ bibauthorid_config Part of the framework responsible for supplying configuration options used by different parts of the framework. Note, however, that it's best to declare any configuration options for the modules within themselves. 
""" import logging.handlers import sys import os.path as osp from invenio.access_control_config import SUPERADMINROLE GLOBAL_CONFIG = True try: from invenio.config import CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS, \ CFG_BIBAUTHORID_MAX_PROCESSES, \ CFG_BIBAUTHORID_PERSONID_MIN_P_FROM_BCTKD_RA, \ CFG_BIBAUTHORID_PERSONID_MIN_P_FROM_NEW_RA, \ CFG_BIBAUTHORID_PERSONID_MAX_COMP_LIST_MIN_TRSH, \ CFG_BIBAUTHORID_PERSONID_MAX_COMP_LIST_MIN_TRSH_P_N, \ CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY, \ CFG_BIBAUTHORID_ATTACH_VA_TO_MULTIPLE_RAS , \ CFG_BIBAUTHORID_ENABLED, \ - CFG_BIBAUTHORID_ON_AUTHORPAGES + CFG_BIBAUTHORID_ON_AUTHORPAGES, \ + CFG_BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE except ImportError: GLOBAL_CONFIG = False # Current version of the framework VERSION = '1.1.0' # make sure current directory is importable FILE_PATH = osp.dirname(osp.abspath(__file__)) if FILE_PATH not in sys.path: sys.path.insert(0, FILE_PATH) # Permission definitions as in actions defined in roles CLAIMPAPER_ADMIN_ROLE = "claimpaperoperators" CLAIMPAPER_USER_ROLE = "claimpaperusers" CMP_ENABLED_ROLE = "paperclaimviewers" CHP_ENABLED_ROLE = "paperattributionviewers" AID_LINKS_ROLE = "paperattributionlinkviewers" CLAIMPAPER_VIEW_PID_UNIVERSE = 'claimpaper_view_pid_universe' CLAIMPAPER_CHANGE_OWN_DATA = 'claimpaper_change_own_data' CLAIMPAPER_CHANGE_OTHERS_DATA = 'claimpaper_change_others_data' CLAIMPAPER_CLAIM_OWN_PAPERS = 'claimpaper_claim_own_papers' CLAIMPAPER_CLAIM_OTHERS_PAPERS = 'claimpaper_claim_others_papers' #Number of persons in a search result for which the first five papers will be shown PERSON_SEARCH_RESULTS_SHOW_PAPERS_PERSON_LIMIT = 10 CMPROLESLCUL = {'guest': 0, CLAIMPAPER_USER_ROLE: 25, CLAIMPAPER_ADMIN_ROLE: 50, SUPERADMINROLE: 50} # Globally enable AuthorID Interfaces. # If False: No guest, user or operator will have access to the system. if GLOBAL_CONFIG: AID_ENABLED = CFG_BIBAUTHORID_ENABLED else: AID_ENABLED = True # Enable AuthorID information on the author pages. if GLOBAL_CONFIG: AID_ON_AUTHORPAGES = CFG_BIBAUTHORID_ON_AUTHORPAGES else: AID_ON_AUTHORPAGES = True # Limit the disambiguation to a specific collections. Leave empty for all # Collections are to be defined as a list of strings LIMIT_TO_COLLECTIONS = [] # Exclude documents that are visible in a collection mentioned here: -EXCLUDE_COLLECTIONS = ["HEPNAMES", "INST"] +EXCLUDE_COLLECTIONS = ["HEPNAMES", "INST", "Deleted", "DELETED", "deleted"] # User info keys for externally claimed records # e.g. for arXiv SSO: ["external_arxivids"] if GLOBAL_CONFIG and CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY: EXTERNAL_CLAIMED_RECORDS_KEY = CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY else: EXTERNAL_CLAIMED_RECORDS_KEY = [] # Lists all filters that are valid to filter the export by. 
# An example is 'arxiv' to return only papers with a 037 entry named arxiv VALID_EXPORT_FILTERS = ["arxiv"] # Max number of threads to parallelize sql queryes in table_utils updates if GLOBAL_CONFIG and CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS: PERSONID_SQL_MAX_THREADS = CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS else: - PERSONID_SQL_MAX_THREADS = 4 + PERSONID_SQL_MAX_THREADS = 12 # Max number of processes spawned by the disambiguation algorithm if GLOBAL_CONFIG and CFG_BIBAUTHORID_MAX_PROCESSES: BIBAUTHORID_MAX_PROCESSES = CFG_BIBAUTHORID_MAX_PROCESSES else: - BIBAUTHORID_MAX_PROCESSES = 4 + BIBAUTHORID_MAX_PROCESSES = 12 # Threshold for connecting a paper to a person: BCTKD are the papers from the # backtracked RAs found searching back for the papers already connected to the # persons, NEW is for the newly found one if GLOBAL_CONFIG and CFG_BIBAUTHORID_PERSONID_MIN_P_FROM_BCTKD_RA: PERSONID_MIN_P_FROM_BCTKD_RA = CFG_BIBAUTHORID_PERSONID_MIN_P_FROM_BCTKD_RA else: PERSONID_MIN_P_FROM_BCTKD_RA = 0.5 if GLOBAL_CONFIG and CFG_BIBAUTHORID_PERSONID_MIN_P_FROM_NEW_RA: PERSONID_MIN_P_FROM_NEW_RA = CFG_BIBAUTHORID_PERSONID_MIN_P_FROM_NEW_RA else: PERSONID_MIN_P_FROM_NEW_RA = 0.5 # Minimum threshold for the compatibility list of persons to an RA: if no RA # is more compatible that that it will create a new person if GLOBAL_CONFIG and CFG_BIBAUTHORID_PERSONID_MAX_COMP_LIST_MIN_TRSH: PERSONID_MAX_COMP_LIST_MIN_TRSH = CFG_BIBAUTHORID_PERSONID_MAX_COMP_LIST_MIN_TRSH else: PERSONID_MAX_COMP_LIST_MIN_TRSH = 0.5 if GLOBAL_CONFIG and CFG_BIBAUTHORID_PERSONID_MAX_COMP_LIST_MIN_TRSH_P_N: PERSONID_MAX_COMP_LIST_MIN_TRSH_P_N = CFG_BIBAUTHORID_PERSONID_MAX_COMP_LIST_MIN_TRSH_P_N else: PERSONID_MAX_COMP_LIST_MIN_TRSH_P_N = 0.5 #Create_new_person flags thresholds PERSONID_CNP_FLAG_1 = 0.75 PERSONID_CNP_FLAG_MINUS1 = 0.5 # update_personid_from_algorithm person_paper_list for get_person_ra call # minimum flag PERSONID_UPFA_PPLMF = -1 # Update/disambiguation process surname list creation method # Can be either 'mysql' or 'regexp'. # 'mysql' is inerently slow but accurate, 'regexp' is really really fast, but with potentially #different results. 'mysql' left in for compatibility. BIBAUTHORID_LIST_CREATION_METHOD = 'regexp' #Tables Utils debug output TABLES_UTILS_DEBUG = False +AUTHORNAMES_UTILS_DEBUG = False # Is the authorid algorithm allowed to attach a virtual author to multiple # real authors in the last run of the orphan processing? if GLOBAL_CONFIG and CFG_BIBAUTHORID_ATTACH_VA_TO_MULTIPLE_RAS: ATTACH_VA_TO_MULTIPLE_RAS = CFG_BIBAUTHORID_ATTACH_VA_TO_MULTIPLE_RAS else: ATTACH_VA_TO_MULTIPLE_RAS = False # Shall we send from locally defined eMail address or from the users one # when we send out a ticket? Default is True -> send with user's email TICKET_SENDING_FROM_USER_EMAIL = True # Log Level for the message output. # Log Levels are defined in the Python logging system # 0 - 50 (log everything - log exceptions) LOG_LEVEL = 30 # Default logging file name LOG_FILENAME = "job.log" # tables_utils_config TABLE_POPULATION_BUNCH_SIZE = 6000 # Max number of authors on a paper to be considered while creating jobs MAX_AUTHORS_PER_DOCUMENT = 15 # Set limit_authors to true, if papers that are written by collaborations # or by more than MAX_AUTHORS_PER_DOCUMENT authors shall be excluded # The default is False. LIMIT_AUTHORS_PER_DOCUMENT = False # Regexp for the names separation NAMES_SEPARATOR_CHARACTER_LIST = ",;.=\-\(\)" SURNAMES_SEPARATOR_CHARACTER_LIST = ",;" # Path where all the modules live and which prefix the have. 
MODULE_PATH = ("%s/bibauthorid_comparison_functions/aid_cmp_*.py" % (FILE_PATH,)) ## threshold for adding a va to more than one real authors for ## the add_new_virtualauthor function REALAUTHOR_VA_ADD_THERSHOLD = 0.449 ## parameters for the 'compute real author name' function CONFIDENCE_THRESHOLD = 0.46 P_THRESHOLD = 0.46 INVERSE_THRESHOLD_DELTA = 0.1 ## parameters for the comparison function chain CONSIDERATION_THRESHOLD = 0.04 ## Set up complex logging system: ## - Setup Default logger, which logs to console on critical events only ## - on init call, set up a three-way logging system: ## - 1. Log to console anything ERROR or higher. ## - 2. Log everything LOG_LEVEL or higher to memory and ## - 3. Flush to file in the specified path. LOGGERS = [] HANDLERS = {} ## Default logger and handler DEFAULT_HANDLER = logging.StreamHandler() DEFAULT_LOG_FORMAT = logging.Formatter('%(levelname)-8s %(message)s') DEFAULT_HANDLER.setFormatter(DEFAULT_LOG_FORMAT) DEFAULT_HANDLER.setLevel(logging.CRITICAL) ## workaround for the classes to detect that LOGGER is actually an instance ## of type logging. LOGGER = logging.getLogger("Dummy") LOGGER.addHandler(DEFAULT_HANDLER) LOGGER.setLevel(LOG_LEVEL) ## force skip ui arxiv stub page (specific fore inspire) -BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE = True +if GLOBAL_CONFIG: + BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE = CFG_BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE +else: + BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE = True BIBAUTHORID_CFG_INSPIRE_LOGIN = 'https://arxiv.org/inspire_login' if not LOGGERS: LOGGERS.append(logging.getLogger("Dummy")) LOGGERS[0].addHandler(DEFAULT_HANDLER) def init_logger(logfile=None): ''' Set up specific logger for 3-way logging. @param logfile: path to file which will be used for flushing the memory log cache. @type logfile: string ''' if not logfile: return False logging.addLevelName(25, "LOG") HANDLERS['filelog'] = logging.FileHandler(logfile, mode="w") HANDLERS['memlog'] = logging.handlers.MemoryHandler(1000, logging.ERROR, HANDLERS['filelog']) HANDLERS['console'] = logging.StreamHandler() formatter = logging.Formatter("%(asctime)s -- %(levelname)-8s %(message)s", "%Y-%m-%d %H:%M:%S") HANDLERS['filelog'].setFormatter(formatter) HANDLERS['memlog'].setFormatter(formatter) HANDLERS['console'].setFormatter(formatter) HANDLERS['memlog'].setLevel(LOG_LEVEL) HANDLERS['console'].setLevel(logging.ERROR) if LOGGERS: LOGGERS[:] = [] LOGGERS.append(logging.getLogger("")) LOGGERS[0].setLevel(LOG_LEVEL) LOGGERS[0].addHandler(HANDLERS['memlog']) LOGGERS[0].addHandler(HANDLERS['console']) def stop_and_close_logger(): ''' Closes and detaches all handlers from the logging instances. Necessary to flush the latest contents of the memory handler to file. ''' HANDLERS['memlog'].close() HANDLERS['filelog'].close() HANDLERS['console'].close() LOGGER.removeHandler(HANDLERS['memlog']) LOGGER.removeHandler(HANDLERS['console']) ## Logging 'device' used by the classes to write log messages LOGGER = LOGGERS[0] ## STANDALONE defines if the algorithm is run within the environment of ## Invenio/Inspire or if it is used individually (e.g. 
Grid usage) STANDALONE = False try: import dbquery except ImportError, err: STANDALONE = True LOGGER.warning('Bibauthorid is running in standalone mode.\n' '-> Access to the database is not supported.') diff --git a/modules/bibauthorid/lib/bibauthorid_daemon.py b/modules/bibauthorid/lib/bibauthorid_daemon.py index 5512e1451..425592ab2 100644 --- a/modules/bibauthorid/lib/bibauthorid_daemon.py +++ b/modules/bibauthorid/lib/bibauthorid_daemon.py @@ -1,1044 +1,1089 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Bibauthorid Daemon This module IS NOT standalone safe - it should never be run this way. """ import sys import os import Queue import os.path as osp import bibauthorid_config as bconfig import bibauthorid_structs as dat from bibauthorid_tables_utils import populate_doclist_for_author_surname from bibauthorid_tables_utils import find_all_last_names from bibauthorid_tables_utils import write_mem_cache_to_tables from bibauthorid_tables_utils import populate_authornames from bibauthorid_tables_utils import populate_authornames_bibrefs_from_authornames from bibauthorid_tables_utils import get_len_authornames_bibrefs from bibauthorid_tables_utils import check_and_create_aid_tables from bibauthorid_tables_utils import load_mem_cache_from_tables from bibauthorid_tables_utils import load_records_to_mem_cache from bibauthorid_tables_utils import init_authornames from bibauthorid_tables_utils import get_papers_recently_modified from bibauthorid_tables_utils import update_authornames_tables_from_paper from bibauthorid_tables_utils import authornames_tables_gc from bibauthorid_tables_utils import update_tables_from_mem_cache from bibauthorid_tables_utils import empty_aid_tables from bibauthorid_virtualauthor_utils import add_minimum_virtualauthor from bibauthorid_virtualauthor_utils import get_va_ids_by_recid_lname from bibauthorid_virtualauthor_utils import delete_virtual_author from bibauthorid_virtualauthor_utils import get_virtualauthor_records from bibauthorid_realauthor_utils import get_realauthors_by_virtuala_id from bibauthorid_realauthor_utils import remove_va_from_ra from bibauthorid_realauthor_utils import del_ra_data_by_vaid from bibauthorid_file_utils import write_mem_cache_to_files from bibauthorid_file_utils import populate_structs_from_files from bibauthorid_file_utils import tail from bibauthorid import start_full_disambiguation from bibauthorid import start_computation from bibauthorid_utils import get_field_values_on_condition from bibauthorid_utils import split_name_parts from bibauthorid_authorname_utils import update_doclist from bibauthorid_personid_tables_utils import get_user_log from bibauthorid_personid_tables_utils import insert_user_log from bibauthorid_personid_tables_utils import update_personID_table_from_paper from 
bibauthorid_personid_tables_utils import update_personID_from_algorithm from bibauthorid_personid_tables_utils import personid_remove_automatically_assigned_papers from bibauthorid_personid_tables_utils import personid_fast_assign_papers import bibtask # Global variables allowing to retain the progress of the task. _INDEX = 0 _RECIDS_NUMBER = 0 def bibauthorid_daemon(): """Constructs the Bibauthorid bibtask.""" bibtask.task_init(authorization_action='runbibclassify', authorization_msg="Bibauthorid Task Submission", description=""" Purpose: Disambiguate Authors and find their identities. Examples: - Process all records that hold an author with last name 'Ellis': $ bibauthorid -u admin --lastname 'Ellis' - Process all records and regard all authors: $ bibauthorid -u admin --process-all - Prepare job packages in folder 'gridfiles' with the sub directories prefixed with 'task' and a maximum number of 2000 records per package: $ bibauthorid -u admin --prepare-grid -d gridfiles -p task -m 2000 """, help_specific_usage=""" NOTE: Options -n, -a, -U, -G and -R are mutually exclusive (XOR)! -n, --lastname=STRING Process only authors with this last name. -a, --process-all The option for cleaning all authors. -U, --update-universe Update bibauthorid universe. Find modified and newly entered records and process all the authors on these records. -G, --prepare-grid Prepares a set of files that supply the pre-clustered data needed for stand alone job to run (e.g. needed on the grid). The behavior of this export can be controlled with the options -d (required), -p and -m (both optional). -R, --load-grid-results Loads the results from the grid jobs and writes them to the database. The behavior of this import can be controlled with the options -d (required). -d, --data-dir=DIRNAME Specifies the data directory, in which the data for the grid preparation will be stored to or loaded from. It requires the -G or -R switch. -p, --prefix=STRING Specifies the prefix of the directories created under the --data-dir directory. Optional. Defaults to 'job'. It requires the -G switch. -m, --max-records Specifies the number of records that shall be stored per job package. Optional. Defaults to 4000 and requires -G switch. --update-cache Updates caches to the newly introduced changes (new and modified documents). This should be called daily or better more then once per day, to ensure the correct operation of the frontend (and the backend). --clean-cache Clean the cache from out of date contents (deleted documents). -r, --record-ids Specifies a list of record ids. To use as on option for --update-universe to limit the update to the selected records --all-records To use as on option for --update-universe to perform the update an all existing record ids. Be WARNED that this will empty and re-fill all aid* tables in the process! --repair-personid Deletes untouched person entities to then re-create and updated these entities. --fast-update-personid Updates personid adding not yet assigned papers to the system, - in a fast, best effort basis. + in a fast, best effort basis. Use -r to limit to a comma separated + set of records. + --personid-gc Garbage collects personid for stale records. Use -r to limit to a comma + separated set of records. 
""", version="Invenio Bibauthorid v%s" % bconfig.VERSION, specific_params=("r:d:n:p:m:GURa", [ "data-dir=", "lastname=", "prefix=", "max-records=", "process-all", "prepare-grid", "load-grid-results", "update-universe", "update-cache", "clean-cache", "record-ids=", "all-records", "repair-personid", - "fast-update-personid=" + "fast-update-personid", + "personid-gc" ]), task_submit_elaborate_specific_parameter_fnc= _task_submit_elaborate_specific_parameter, task_submit_check_options_fnc=_task_submit_check_options, task_run_fnc=_task_run_core) def _task_submit_elaborate_specific_parameter(key, value, opts, args): """ Given the string key it checks it's meaning, eventually using the value. Usually, it fills some key in the options dict. It must return True if it has elaborated the key, False, if it doesn't know that key. """ if key in ("-n", "--lastname"): if value == "None," or value == "None": bibtask.write_message("The value specified for --lastname must " "be a valid name. Not '%s'." % value, stream=sys.stdout, verbose=0) return False bibtask.task_set_option('lastname', value) elif key in ("-a", "--process-all"): bibtask.task_set_option("process_all", True) elif key in ("-U", "--update-universe"): bibtask.task_set_option("update", True) elif key in ("-G", "--prepare-grid"): bibtask.task_set_option("prepare_grid", True) elif key in ("-R", "--load-grid-results"): bibtask.task_set_option("load_grid_results", True) elif key in ("-d", "--data-dir"): bibtask.task_set_option("data_dir", value) elif key in ("-p", "--prefix"): bibtask.task_set_option("prefix", value) elif key in ("-m", "--max-records"): bibtask.task_set_option("max_records", value) elif key in ("--update-cache",): bibtask.task_set_option("update_cache", True) elif key in ("--clean-cache",): bibtask.task_set_option("clean_cache", True) elif key in ("--record-ids", '-r'): if value.count("="): value = value[1:] value = value.split(",") bibtask.task_set_option("record_ids", value) elif key in ("--all-records"): bibtask.task_set_option("all_records", True) elif key in ("--repair-personid"): bibtask.task_set_option("repair_pid", True) elif key in ("--fast-update-personid"): - bibtask.task_set_option("fast_update_personid", value) + bibtask.task_set_option("fast_update_personid", True) + + elif key in ("--personid-gc"): + bibtask.task_set_option("personid_gc", True) else: return False return True def _task_run_core(): """ Runs the requested task in the bibsched environment. 
""" lastname = bibtask.task_get_option('lastname') process_all = bibtask.task_get_option('process_all') prepare_grid = bibtask.task_get_option('prepare_grid') load_grid = bibtask.task_get_option('load_grid_results') data_dir = bibtask.task_get_option('data_dir') prefix = bibtask.task_get_option('prefix') max_records_option = bibtask.task_get_option('max_records') update = bibtask.task_get_option('update') clean_cache = bibtask.task_get_option('clean_cache') update_cache = bibtask.task_get_option('update_cache') record_ids = bibtask.task_get_option('record_ids') record_ids_nested = None all_records = bibtask.task_get_option('all_records') repair_pid = bibtask.task_get_option('repair_pid') fast_update_personid = bibtask.task_get_option('fast_update_personid') + personid_gc = bibtask.task_get_option('personid_gc') if record_ids: record_ids_nested = [[p] for p in record_ids] - if fast_update_personid: - fast_update_personid = [[p] for p in fast_update_personid] # automated_daemon_mode_p = True if lastname: bibtask.write_message("Processing last name %s" % (lastname), stream=sys.stdout, verbose=0) if process_all: if bconfig.STANDALONE: bibtask.write_message("Processing not possible in standalone!", stream=sys.stdout, verbose=0) return 0 bibtask.write_message("Processing all names...", stream=sys.stdout, verbose=0) lengths = get_len_authornames_bibrefs() if not check_and_create_aid_tables(): bibtask.write_message("Failed to create database tables!", stream=sys.stdout, verbose=0) return 0 if lengths['names'] < 1: bibtask.write_message("Populating Authornames table. It's Empty.", stream=sys.stdout, verbose=0) bibtask.task_update_progress('Populating Authornames table.') populate_authornames() insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status', comment='bibauthorid_daemon, ' 'update_authornames_tables_from_paper') if lengths['bibrefs'] < 1: bibtask.write_message("Populating Bibrefs lookup. 
It's Empty.", stream=sys.stdout, verbose=0) bibtask.task_update_progress('Populating Bibrefs lookup table.') populate_authornames_bibrefs_from_authornames() bibtask.task_update_progress('Processing all authors.') start_full_disambiguation(last_names="all", process_orphans=True, db_exists=False, populate_doclist=True, write_to_db=True) update_personID_from_algorithm() insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status', comment='bibauthorid_daemon, update_authorid_universe') if prepare_grid: bibtask.write_message("Preparing Grid Job", stream=sys.stdout, verbose=0) data_dir_name = "grid_data" workdir_prefix = "job" max_records = 4000 if data_dir: data_dir_name = data_dir if prefix: workdir_prefix = prefix if max_records_option: max_records = max_records_option _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records) if load_grid: bibtask.write_message("Reading Grid Job results and will write" " them to the database.", stream=sys.stdout, verbose=0) _write_data_files_to_db(data_dir) if update or update_cache: bibtask.write_message("update-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0) bibtask.task_update_progress('update-cache: Processing recently' ' updated papers') _run_update_authornames_tables_from_paper(record_ids_nested, all_records) bibtask.write_message("update-cache: Finished processing papers", stream=sys.stdout, verbose=0) bibtask.task_update_progress('update-cache: DONE') if update: bibtask.write_message("updating authorid universe", stream=sys.stdout, verbose=0) bibtask.task_update_progress('updating authorid universe') _update_authorid_universe(record_ids, all_records) bibtask.write_message("done updating authorid universe", stream=sys.stdout, verbose=0) bibtask.task_update_progress('done updating authorid universe') if clean_cache: bibtask.write_message("clean-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0) bibtask.task_update_progress('clean-cache: Processing recently updated' ' papers for names') _run_authornames_tables_gc() bibtask.write_message("update-cache: Finished cleaning authornames " "tables", stream=sys.stdout, verbose=0) bibtask.task_update_progress('clean-cache: Processing recently updated' ' papers for persons') _run_update_personID_table_from_paper(record_ids_nested, all_records) bibtask.write_message("update-cache: Finished cleaning PersonID" " table", stream=sys.stdout, verbose=0) bibtask.task_update_progress('clean-cache: DONE') if repair_pid: bibtask.task_update_progress('Updating names cache...') _run_update_authornames_tables_from_paper() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('Removing person entities not touched by ' 'humans...') personid_remove_automatically_assigned_papers() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('Updating person entities...') update_personID_from_algorithm() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('Cleaning person tables...') _run_update_personID_table_from_paper() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('All repairs done.') if fast_update_personid: bibtask.task_update_progress('Updating personid...') - _run_personid_fast_assign_papers(fast_update_personid) - bibtask.task_update_progress('Update finished...') + _run_personid_fast_assign_papers(record_ids_nested) + bibtask.task_update_progress('PersonID update finished!') + + if personid_gc: + 
bibtask.task_update_progress('Updating personid (GC)...') + _run_personid_gc(record_ids_nested) + bibtask.task_update_progress('PersonID update finished (GC)!') #TODO: remember to pass the papers list! return 1 def _task_submit_check_options(): """ Required by bibtask. Checks the options. """ lastname = bibtask.task_get_option('lastname') process_all = bibtask.task_get_option('process_all') prepare_grid = bibtask.task_get_option('prepare_grid') load_grid = bibtask.task_get_option('load_grid_results') data_dir = bibtask.task_get_option('data_dir') prefix = bibtask.task_get_option('prefix') max_records = bibtask.task_get_option('max_records') update = bibtask.task_get_option('update') clean_cache = bibtask.task_get_option('clean_cache') update_cache = bibtask.task_get_option('update_cache') record_ids = bibtask.task_get_option('record_ids') all_records = bibtask.task_get_option('all_records') repair_pid = bibtask.task_get_option('repair_pid') fast_update_personid = bibtask.task_get_option('fast_update_personid') + personid_gc = bibtask.task_get_option('personid_gc') if (record_ids and all_records): bibtask.write_message("ERROR: conflicting options: --record-ids and " "--all-records cannot be specified at the same " "time.", stream=sys.stdout, verbose=0) return False if (lastname == "None," or lastname == "None"): lastname = False if (not lastname and not process_all and not update and not prepare_grid and not load_grid and not clean_cache - and not update_cache and not fast_update_personid): + and not update_cache and not fast_update_personid and not personid_gc): bibtask.write_message("ERROR: One of the options -a, -n, -U, -G, -R, " - "--clean-cache, --update-cache, --fast-update-personid is" - " required!", stream=sys.stdout, verbose=0) + "--clean-cache, --update-cache, --fast-update-personid " + "--personid-gc is required!", stream=sys.stdout, verbose=0) return False elif not (bool(lastname) ^ bool(process_all) ^ bool(update) ^ bool(prepare_grid) ^ bool(load_grid) ^ bool(clean_cache) - ^ bool(update_cache) ^ bool(repair_pid) ^ bool(fast_update_personid)): + ^ bool(update_cache) ^ bool(repair_pid) ^ bool(fast_update_personid) + ^ bool(personid_gc)): bibtask.write_message("ERROR: Options -a -n -U -R -G --clean-cache " "--update-cache --repair-personid --fast-update-personid " - "are mutually" + "--personid-gc are mutually" " exclusive!", stream=sys.stdout, verbose=0) return False elif ((not prepare_grid and (data_dir or prefix or max_records)) and (not load_grid and (data_dir))): bibtask.write_message("ERROR: The options -d, -m and -p require -G or " "-R to run!", stream=sys.stdout, verbose=0) return False elif load_grid and not bool(data_dir): bibtask.write_message("ERROR: The option -R requires the option -d " "to run!", stream=sys.stdout, verbose=0) return False return True def _write_data_files_to_db(data_dir_name): ''' Reads all the files of a specified directory and writes the content to the memory cache and from there to the database. @param data_dir_name: Directory where to look for the files @type data_dir_name: string ''' if data_dir_name.endswith("/"): data_dir_name = data_dir_name[0:-1] if not data_dir_name: bibtask.write_message("Data directory not specified. Task failed.", stream=sys.stdout, verbose=0) return False if not osp.isdir(data_dir_name): bibtask.write_message("Specified Data directory is not a directory. 
" "Task failed.", stream=sys.stdout, verbose=0) return False job_dirs = os.listdir(data_dir_name) total = len(job_dirs) status = 0 for job_dir in job_dirs: status += 1 job_dir = "%s/%s" % (data_dir_name, job_dir) if not osp.isdir(job_dir): bibtask.write_message("This is not a directory and therefore " "skipped: %s." % job_dir, stream=sys.stdout, verbose=0) continue results_dir = "%s/results/" % (job_dir,) if not osp.isdir(results_dir): bibtask.write_message("No result set found in %s" % (results_dir,), stream=sys.stdout, verbose=0) continue log_name = osp.abspath(job_dir).split("/") logfile = "%s/%s.log" % (job_dir, log_name[-1]) logfile_lastline = "" if not osp.isfile(logfile): bibtask.write_message("No log file found in %s" % (job_dir,), stream=sys.stdout, verbose=0) continue try: logfile_lastline = tail(logfile) except IOError: logfile_lastline = "" if logfile_lastline.count("Finish! The computation finished in") < 1: bibtask.write_message("Log file indicates broken results for %s" % (job_dir,), stream=sys.stdout, verbose=0) continue correct_files = set(['realauthors.dat', 'ids.dat', 'virtual_author_clusters.dat', 'virtual_authors.dat', 'doclist.dat', 'virtual_author_data.dat', 'authornames.dat', 'virtual_author_cluster_cache.dat', 'realauthor_data.dat', 'ra_va_cache.dat'] ) result_files = os.listdir(results_dir) if not correct_files.issubset(set(result_files)): bibtask.write_message("Reults folder does not hold the " "correct files: %s" % (results_dir,), stream=sys.stdout, verbose=0) continue bibtask.task_update_progress('Loading job %s of %s: %s' % (status, total, log_name[-1])) if (populate_structs_from_files(results_dir, results=True) and write_mem_cache_to_tables(sanity_checks=True)): bibtask.write_message("All Done.", stream=sys.stdout, verbose=0) else: bibtask.write_message("Could not write data to the tables from %s" % (results_dir,), stream=sys.stdout, verbose=0) def _prepare_data_files_from_db(data_dir_name="grid_data", workdir_prefix="job", max_records=4000): ''' Prepares grid jobs. Is a task running in bibsched. Meaning: 1. Find all last names in the database 2. For each last name: - find all documents regarding this last name (ignore first names) - if number of documents loaded into memory exceeds max_records, write the memory cache into files (cf. Files section). Each write back procedure will happen into a newly created directory. The prefix for the respective job directory may be specified as well as the name of the data directory where these job directories will be created. Files: - authornames.dat - virtual_authors.dat - virtual_author_data.dat - virtual_author_clusters.dat - virtual_author_cluster_cache.dat - realauthors.dat - realauthor_data.dat - doclist.dat - records.dat - ids.dat - ra_va_cache.dat @param data_dir_name: the name of the directory that will hold all the sub directories for the jobs. @type data_dir_name: string @param workdir_prefix: prefix for the job sub directories. @type workdir_prefix: string @param max_records: maximum number of records after which the memory cache is to be flushed to files. 
@type max_records: int ''' try: max_records = int(max_records) except ValueError: max_records = 4000 bibtask.write_message("Loading last names", stream=sys.stdout, verbose=0) bibtask.write_message("Limiting files to %s records" % (max_records,), stream=sys.stdout, verbose=0) bibtask.task_update_progress('Loading last names...') last_names = find_all_last_names() last_name_queue = Queue.Queue() for last_name in sorted(last_names): last_name_queue.put(last_name) total = len(last_names) status = 1 bibtask.write_message("Done. Loaded %s last names." % (total), stream=sys.stdout, verbose=0) job_id = 0 data_dir = "" if data_dir_name.startswith("/"): data_dir = data_dir_name else: data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name) if not data_dir.endswith("/"): data_dir = "%s/" % (data_dir,) job_lnames = [] while True: if last_name_queue.empty(): bibtask.write_message("Done with all names.", stream=sys.stdout, verbose=0) break bibtask.task_sleep_now_if_required(can_stop_too=False) lname_list = last_name_queue.get() lname = None if lname_list: lname = lname_list[0] del(lname_list[0]) else: bconfig.LOGGER.warning("Got an empty Queue element. " "Queue seems corrupted.") continue job_lnames.append(lname) bibtask.task_update_progress('Preparing job %d of %d: %s.' % (status, total, lname)) bibtask.write_message(("Processing: %s (%d/%d).") % (lname, status, total), stream=sys.stdout, verbose=0) bibtask.task_sleep_now_if_required(can_stop_too=False) populate_doclist_for_author_surname(lname) post_remove_names = set() for name in [row['name'] for row in dat.AUTHOR_NAMES if not row['processed']]: potential_removal = "%s," % (name.split(',')[0],) if not potential_removal == "%s" % (lname,): post_remove_names.add(potential_removal) if len(post_remove_names) > 1: removed = 0 removed_names = [] for post_remove_name in post_remove_names: if post_remove_name in lname_list: lname_list.remove(post_remove_name) removed_names.append(post_remove_name) removed += 1 bibtask.write_message(("-> Removed %s entries from the " + "computation list: %s") % (removed, removed_names), stream=sys.stdout, verbose=0) total -= removed if lname_list: last_name_queue.put(lname_list) if len(dat.RELEVANT_RECORDS) >= max_records: if not os.path.exists(data_dir): os.mkdir(data_dir) work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id) _write_to_files(work_dir, job_lnames) bibtask.task_sleep_now_if_required(can_stop_too=True) job_lnames = [] job_id += 1 status += 1 if dat.RELEVANT_RECORDS: if not os.path.exists(data_dir): os.mkdir(data_dir) work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id) _write_to_files(work_dir, job_lnames) bibtask.task_sleep_now_if_required(can_stop_too=True) return True def _update_authorid_universe(record_ids=None, all_records=False): ''' Updates all data related to the authorid algorithm. Sequence of operations: - Get all recently updated papers and remember time in the log - Get all authors on all papers - Extract collection of last names - For each last name: - Populate mem cache with cluster data - Delete updated records and their virtual authors from mem cache - Create virtual authors for new and updated records - Start matching algorithm - Update tables with results of the computation - Start personid update procedure ''' def create_vas_from_specific_doclist(bibrec_ids): ''' Processes the document list and creates a new minimal virtual author for each author in each record specified in the given list. 
@param bibrec_ids: Record IDs to concern in this update @type bibrec_ids: list of int ''' num_docs = len([row for row in dat.DOC_LIST if row['bibrecid'] in bibrec_ids]) bconfig.LOGGER.log(25, "Creating minimal virtual authors for " "all loaded docs (%s)" % (num_docs)) for docs in [row for row in dat.DOC_LIST if row['bibrecid'] in bibrec_ids]: for author_id in docs['authornameids']: author_name = [an['name'] for an in dat.AUTHOR_NAMES if an['id'] == author_id] refrecs = [ref[1] for ref in docs['authornameid_bibrefrec'] if ref[0] == author_id] refrec = -1 if len(refrecs) > 1: refrec = refrecs[0] elif refrecs: refrec = refrecs[0] if refrec and author_name: add_minimum_virtualauthor(author_id, author_name[0], docs['bibrecid'], 0, [], refrec) elif author_name: add_minimum_virtualauthor(author_id, author_name[0], docs['bibrecid'], 0, []) dat.reset_mem_cache(True) last_log = None updated_records = [] if not record_ids and not all_records: last_log = get_user_log(userinfo='daemon', action='update_aid', only_most_recent=True) if last_log: #select only the most recent papers recently_modified, last_update_time = get_papers_recently_modified( date=last_log[0][2]) insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status', comment='bibauthorid_daemon, update_authorid_universe', timestamp=last_update_time[0][0]) bibtask.write_message("Update authorid will operate on %s records." % (len(recently_modified)), stream=sys.stdout, verbose=0) if not recently_modified: bibtask.write_message("Update authorid: Nothing to do", stream=sys.stdout, verbose=0) return for rec in recently_modified: updated_records.append(rec[0]) dat.update_log("rec_updates", rec[0]) else: bibtask.write_message("Update authorid: Nothing to do", stream=sys.stdout, verbose=0) return elif record_ids and not all_records: updated_records = record_ids elif not record_ids and all_records: bibtask.write_message("Update is going to empty all aid tables...", stream=sys.stdout, verbose=0) empty_aid_tables() bibtask.write_message("Update authorid will operate on all! records.", stream=sys.stdout, verbose=0) bibtask.task_update_progress('Update is operating on all! 
records.') start_full_disambiguation(process_orphans=True, db_exists=False, populate_doclist=True, write_to_db=True) bibtask.task_update_progress('Update is done.') return bibtask.task_sleep_now_if_required(can_stop_too=True) authors = [] author_last_names = set() bibtask.task_update_progress('Reading authors from updated records') bibtask.write_message("Reading authors from updated records", stream=sys.stdout, verbose=0) updated_ras = set() # get all authors from all updated records for rec in updated_records: rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a", source="API") for rec_author in rec_authors: if not rec_author: bconfig.LOGGER.error("Invalid empty author string, which " "will be skipped on record %s" % (rec)) continue author_in_list = [row for row in authors if row['db_name'] == rec_author] if author_in_list: for upd in [row for row in authors if row['db_name'] == rec_author]: upd['records'].append(rec) else: last_name = split_name_parts(rec_author)[0] author_last_names.add(last_name) authors.append({'db_name': rec_author, 'records': [rec], 'last_name': last_name}) for status, author_last_name in enumerate(author_last_names): bibtask.task_sleep_now_if_required(can_stop_too=False) current_authors = [row for row in authors if row['last_name'] == author_last_name] total_lnames = len(author_last_names) total_authors = len(current_authors) bibtask.task_update_progress('Processing %s of %s cluster: "%s" ' '(%s authors)' % (status + 1, total_lnames, author_last_name, total_authors)) bibtask.write_message('Processing %s of %s cluster: "%s" ' '(%s authors)' % (status + 1, total_lnames, author_last_name, total_authors), stream=sys.stdout, verbose=0) dat.reset_mem_cache(True) init_authornames(author_last_name) load_mem_cache_from_tables() bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory" " to start processing") for current_author in current_authors: load_records_to_mem_cache(current_author['records']) authornamesid = [row['id'] for row in dat.AUTHOR_NAMES if row['db_name'] == current_author['db_name']] if not authornamesid: bconfig.LOGGER.error("The author '%s' rec '%s' is not in authornames " "and will be skipped. You might want " "to run authornames update before?" % (current_author['db_name'], rec)) continue else: try: authornamesid = int(authornamesid[0]) except (IndexError, TypeError, ValueError): bconfig.LOGGER.error("Invalid authornames ID!") continue if not current_author['records']: bconfig.LOGGER.error("The author '%s' is not associated to any" " document and will be skipped." % (current_author['db_name'])) continue for rec in current_author['records']: # remove VAs already existing for the record va_ids = get_va_ids_by_recid_lname(rec, current_author["last_name"]) if va_ids: for va_id in va_ids: ra_list = get_realauthors_by_virtuala_id(va_id) for ra_id in ra_list: remove_va_from_ra(ra_id, va_id) del_ra_data_by_vaid(ra_id, va_id) va_anames_id = get_virtualauthor_records(va_id, "orig_authorname_id") for an_list in [row['authornameids'] for row in dat.DOC_LIST if row['bibrecid'] == rec]: try: an_list.remove(va_anames_id) except (ValueError): # This names id is not in the list...don't care pass delete_virtual_author(va_id) # create new VAs for the record. 
update_doclist(rec, authornamesid) dat.update_log("rec_updates", rec) create_vas_from_specific_doclist(current_author['records']) bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.") bibtask.task_sleep_now_if_required(can_stop_too=False) start_computation(process_doclist=False, process_orphans=True, print_stats=True) bconfig.LOGGER.log(25, "-- Computation finished. Will write back to " "the database now.") update_db_result = update_tables_from_mem_cache(return_ra_updates=True) bibtask.task_sleep_now_if_required(can_stop_too=True) if not update_db_result[0]: bconfig.LOGGER.log(25, "Writing to persistence layer failed.") else: if update_db_result[1]: for updated_ra in update_db_result[1]: if updated_ra: updated_ras.add(updated_ra[0]) bconfig.LOGGER.log(25, "Done updating authorid universe.") personid_ra_format = [] for ra_id in updated_ras: personid_ra_format.append((ra_id,)) bconfig.LOGGER.log(25, "Will now run personid update to make the " "changes visible also on the front end and to " "create person IDs for %s newly created and changed " "authors." % len(updated_ras)) bibtask.task_update_progress('Updating persistent Person IDs') bibtask.task_sleep_now_if_required(can_stop_too=False) update_personID_from_algorithm(personid_ra_format) bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying " "with bibauthorid!") def _write_to_files(work_dir, job_lnames): ''' Wrapper function around this internal write process. Triggers the write-back to the files to the mem cache. @param work_dir: where shall the files be stored? @type work_dir: string @param job_lnames: list of names @type job_lnames: list ''' bibtask.task_update_progress('Writing to files in %s' % (work_dir)) bibtask.write_message("Writing cluster with %s entries to " "files in %s" % (len(dat.RELEVANT_RECORDS), work_dir,), stream=sys.stdout, verbose=0) if not os.path.exists(work_dir): os.mkdir(work_dir) write_mem_cache_to_files(work_dir, job_lnames) dat.reset_mem_cache(True) def _run_update_authornames_tables_from_paper(record_ids=None, all_records=False): ''' Runs the update on the papers which have been modified since the last run @note: This should be run as often as possible to keep authornames and authornames_bibrefs cache tables up to date. ''' if not all_records and not record_ids: last_log = get_user_log(userinfo='daemon', action='UATFP', only_most_recent=True) if len(last_log) >= 1: #select only the most recent papers recently_modified, min_date = get_papers_recently_modified(date=last_log[0][2]) - insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status', comment='bibauthorid_daemon, update_authornames_tables_from_paper', timestamp=min_date[0][0]) + insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status', + comment='bibauthorid_daemon, update_authornames_tables_from_paper', + timestamp=min_date[0][0]) if not recently_modified: bibtask.write_message("update_authornames_tables_from_paper: " "All names up to date.", stream=sys.stdout, verbose=0) else: - bibtask.write_message("update_authornames_tables_from_paper: Running on %s papers " % str(len(recently_modified)), stream=sys.stdout, verbose=0) + bibtask.write_message( + "update_authornames_tables_from_paper: Running on %s papers " % str( + len(recently_modified)), stream=sys.stdout, verbose=0) update_authornames_tables_from_paper(recently_modified) else: #this is the first time the utility is run, run on all the papers? 
#Probably better to write the log on the first authornames population #@todo: authornames population writes the log recently_modified, min_date = get_papers_recently_modified() - insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status', comment='bibauthorid_daemon, update_authornames_tables_from_paper', timestamp=min_date[0][0]) - bibtask.write_message("update_authornames_tables_from_paper: Running on %s papers " % str(len(recently_modified)), stream=sys.stdout, verbose=0) + insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status', + comment='bibauthorid_daemon, update_authornames_tables_from_paper', + timestamp=min_date[0][0]) + bibtask.write_message( + "update_authornames_tables_from_paper: Running on %s papers " % str( + len(recently_modified)), stream=sys.stdout, verbose=0) update_authornames_tables_from_paper(recently_modified) else: bibtask.write_message("update_authornames_tables_from_paper: Running " "on all papers ", stream=sys.stdout, verbose=0) update_authornames_tables_from_paper(record_ids) def _run_update_personID_table_from_paper(record_ids=None, all_records=False): ''' Runs the update on the papers which have been modified since the last run This is removing no-longer existing papers from the personid table. @note: Update recommended monthly. @warning: quite resource intensive. ''' if not record_ids and not all_records: last_log = get_user_log(userinfo='daemon', action='UPITFP', only_most_recent=True) if len(last_log) >= 1: #select only the most recent papers recently_modified, min_date = get_papers_recently_modified(date=last_log[0][2]) - insert_user_log('daemon', '-1', 'UPITFP', 'bibsched', 'status', comment='bibauthorid_daemon, update_personID_table_from_paper', timestamp=min_date[0][0]) + insert_user_log('daemon', '-1', 'UPITFP', 'bibsched', 'status', + comment='bibauthorid_daemon, update_personID_table_from_paper', + timestamp=min_date[0][0]) if not recently_modified: bibtask.write_message("update_personID_table_from_paper: " "All person entities up to date.", stream=sys.stdout, verbose=0) else: - bibtask.write_message("update_personID_table_from_paper: Running on: " + str(recently_modified), stream=sys.stdout, verbose=0) + bibtask.write_message("update_personID_table_from_paper: Running on: " + + str(recently_modified), stream=sys.stdout, verbose=0) update_personID_table_from_paper(recently_modified) else: # Should not process all papers, hence authornames population writes # the appropriate log. In case the log is missing, process everything. recently_modified, min_date = get_papers_recently_modified() - insert_user_log('daemon', '-1', 'UPITFP', 'bibsched', 'status', comment='bibauthorid_daemon, update_personID_table_from_paper', timestamp=min_date[0][0]) - bibtask.write_message("update_personID_table_from_paper: Running on: " + str(recently_modified), stream=sys.stdout, verbose=0) + insert_user_log('daemon', '-1', 'UPITFP', 'bibsched', 'status', + comment='bibauthorid_daemon, update_personID_table_from_paper', + timestamp=min_date[0][0]) + bibtask.write_message("update_personID_table_from_paper: Running on: " + + str(recently_modified), stream=sys.stdout, verbose=0) update_personID_table_from_paper(recently_modified) # @todo: develop a method that removes the respective VAs from the database # as well since no reference will be there for them any longer. VAs can be # found by searching for the authornames ID in the VA table. The # method has to kill RA data based on the VA (cf. 
del_ra_data_by_vaid in # ra utils as a reference), all VA2RA links, all VA data, all VAs and # finally all doclist refs that point to the respective bibrefs. else: update_personID_table_from_paper(record_ids) def _run_authornames_tables_gc(): ''' Runs the garbage collector on the authornames tables, to get rid of deleted bibrefs in the respective author tables ''' - insert_user_log('daemon', '-1', 'ANTGC', 'bibsched', 'status', comment='bibauthorid_daemon, authornames_tables_gc') + insert_user_log('daemon', '-1', 'ANTGC', 'bibsched', 'status', + comment='bibauthorid_daemon, authornames_tables_gc') authornames_tables_gc() def _run_personid_fast_assign_papers(paperslist): - insert_user_log('daemon', '-1', 'PFAP', 'bibsched', 'status', comment='bibauthorid_daemon, personid_fast_assign_papers') - personid_fast_assign_papers(paperslist) + insert_user_log('daemon', '-1', 'PFAP', 'bibsched', 'status', + comment='bibauthorid_daemon, personid_fast_assign_papers on ' + str(paperslist)) + if not paperslist: + #update_authornames_tables_from_paper() + personid_fast_assign_papers() + else: + #update_authornames_tables_from_paper(paperslist) + personid_fast_assign_papers(paperslist) + +def _run_personid_gc(paperslist): + insert_user_log('daemon', '-1', 'PGC', 'bibsched', 'status', + comment='bibauthorid_daemon, personid_gc (update_personid_from_papers) on ' + + str(paperslist)) + if not paperslist: + #update_authornames_tables_from_paper() + update_personID_table_from_paper() + else: + #update_authornames_tables_from_paper(paperslist) + update_personID_table_from_paper(paperslist) diff --git a/modules/bibauthorid/lib/bibauthorid_personid_tables_utils.py b/modules/bibauthorid/lib/bibauthorid_personid_tables_utils.py index 0a7f9a0f2..5606e2845 100644 --- a/modules/bibauthorid/lib/bibauthorid_personid_tables_utils.py +++ b/modules/bibauthorid/lib/bibauthorid_personid_tables_utils.py @@ -1,3169 +1,3168 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ bibauthorid_personid_tables_utils Bibauthorid's personid related DB handler """ import sys import time import threading import datetime import bibauthorid_config as bconfig import re import os try: import multiprocessing from multiprocessing import Queue # FIXME: Queue import below override this! 
except ImportError: # FIXME: multiprocessing is not required dependency, so should be # optional pass from Queue import Empty from Queue import Queue from invenio.config import CFG_ETCDIR from bibauthorid_utils import split_name_parts, create_normalized_name, create_canonical_name from bibauthorid_utils import clean_name_string, get_field_values_on_condition from bibauthorid_authorname_utils import soft_compare_names, compare_names #from bibauthorid_authorname_utils import create_name_tuples -from bibauthorid_authorname_utils import names_are_equal_composites -from bibauthorid_authorname_utils import names_are_equal_gender -from bibauthorid_authorname_utils import names_are_substrings -from bibauthorid_authorname_utils import names_are_synonymous -from bibauthorid_authorname_utils import names_minimum_levenshtein_distance +from bibauthorid_authorname_utils import full_names_are_equal_composites +from bibauthorid_authorname_utils import full_names_are_equal_gender +from bibauthorid_authorname_utils import full_names_are_substrings +from bibauthorid_authorname_utils import full_names_are_synonymous +from bibauthorid_authorname_utils import full_names_minimum_levenshtein_distance from bibauthorid_tables_utils import get_bibrefs_from_name_string, update_authornames_tables_from_paper from invenio.search_engine import perform_request_search from threading import Thread from operator import itemgetter try: from dbquery import run_sql, close_connection#, deserialize_via_marshal from dbquery import OperationalError, ProgrammingError from access_control_engine import acc_authorize_action # from webuser import collect_user_info from data_cacher import DataCacher except ImportError: from invenio.data_cacher import DataCacher from invenio.dbquery import run_sql, close_connection #from invenio.dbquery import deserialize_via_marshal from invenio.dbquery import OperationalError, ProgrammingError from invenio.access_control_engine import acc_authorize_action # from invenio.webuser import collect_user_info DATA_CACHERS = [] """ DATA_CACHERS is a list of Data Cacher objects to be persistent in memory """ class PersonIDStatusDataCacher(DataCacher): ''' Data Cacher to monitor the existence of personid data ''' def __init__(self): ''' Initializes the Data Cacher ''' def cache_filler(): ''' Sets the Data Cacher content to True if the table is not empty ''' try: res = run_sql("SELECT count(personid) FROM aidPERSONID " "where tag='paper'") if res and res[0] and res[0][0] > 0: return True else: return False except Exception: # database problems, return empty cache return False def timestamp_verifier(): ''' Verifies that the table is still empty every 2 hours ''' dt = datetime.datetime.now() td = dt - datetime.timedelta(hours=2) return td.strftime("%Y-%m-%d %H:%M:%S") DataCacher.__init__(self, cache_filler, timestamp_verifier) def get_recids_affected_since(last_timestamp): ''' Returns a list of recids which have been manually changed since timestamp @TODO: extend the system to track and signal even automatic updates (unless a full reindex is acceptable in case of magic automatic update) @param: last_timestamp: last update, datetime.datetime ''' vset = set() values = run_sql("select value from aidUSERINPUTLOG where timestamp > %s", (last_timestamp,)) for v in values: if ',' in v[0] and ':' in v[0]: vset.add(v[0].split(',')[1]) pids = run_sql("select distinct personid from aidUSERINPUTLOG where timestamp > %s", (last_timestamp,)) pidlist = [p[0] for p in pids if p[0] >= 0] values = [] if len(pidlist) > 1: values = 
run_sql("select data from aidPERSONID where tag='paper' and personid in %s", (pidlist,)) elif len(pidlist) == 1: values = run_sql("select data from aidPERSONID where tag='paper' and personid = %s", (pidlist[0],)) for v in values: if ',' in v[0] and ':' in v[0]: vset.add(v[0].split(',')[1]) # transform output to list of integers, since we are returning recIDs: return [int(recid) for recid in list(vset)] def get_persons_affected_since(last_timestamp, return_alt_names=False, return_all_person_papers=False): recids = get_recids_affected_since(last_timestamp) return get_persons_from_recids(recids, return_alt_names, return_all_person_papers) def remove_empty_personids(): pids = run_sql("select distinct personid from aidPERSONID order by personid") for pid in pids: #print "Considering: ", pid[0] papers = run_sql("select * from aidPERSONID where personid=%s and tag='paper'", (pid[0],)) if len(papers) < 1: print "Considering: ", pid[0] print '|- no papers, deleting!' run_sql("delete from aidPERSONID where personid=%s", (pid[0],)) else: positive_papers = run_sql("select * from aidPERSONID where personid=%s and tag='paper' and flag > '-2'", (pid[0],)) if len(positive_papers) < 1: print "Considering: ", pid[0] print '|- no positive papers, deleting!' run_sql("delete from aidPERSONID where personid=%s", (pid[0],)) def populate_authorpages_cache(pids=None): ''' Populates / updates author pages caches for all pids or pids in given list @param pids: optional (('pid1',), ('pid2',), ...) ''' # from invenio.websearch_webinterface import WebInterfaceAuthorPages if not pids: pids = run_sql("select distinct personid from aidPERSONID order by personid") # class blah(object): # argd = "" # def __init__(self): # pass # class blah(object): # argd = "" # _session = {'_accessed': 1304323116.8829861, # '_created': 1304323116.8829839, # '_data': {'uid': 1, # 'user_info': {'agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.57 Safari/534.24', # 'email': 'samuele.carli@cern.ch', # 'group': [], # 'guest': '0', # 'language': 'en', # 'login_method': 'Local', # 'nickname': 'admin', # 'precached_permitted_restricted_collections': [], # 'precached_useadmin': True, # 'precached_usealerts': True, # 'precached_useapprove': True, # 'precached_usebaskets': True, # 'precached_usegroups': True, # 'precached_useloans': True, # 'precached_usemessages': True, # 'precached_usepaperattribution': True, # 'precached_usepaperclaim': True, # 'precached_usestats': True, # 'precached_viewclaimlink': False, # 'precached_viewsubmissions': False, # 'referer': 'https://pcgssiz80.cern.ch/youraccount/login', # 'remote_host': '', # 'remote_ip': '128.141.29.224', # 'session': '43b0d28b299d491e88eb0966334f66d5', # 'uid': 1, # 'uri': '/youraccount/login?'}}, # '_http_ip': None, # '_https_ip': '128.141.29.224', # '_remember_me': False, # '_timeout': 172800} # def __init__(self): # pass for pid in pids: print 'Updating cache for pid: ', str(pid[0]) # iid = str(pid[0]) # ap = WebInterfaceAuthorPages(iid) # ap._update_cache(iid, ap._real__call__(blah(), {}, return_html=True)) # del ap os.system('wget -O - http://pcgssiz80.cern.ch/author/%s > /dev/null' % str(pid[0])) print ' ... done.' 
def get_cached_author_page(pageparam): ''' Return cached authorpage @param: pageparam (int personid) @return (id, 'authorpage_cache', personid, authorpage_html, date_cached) ''' #TABLE: id, tag, identifier, data, date caches = run_sql("select * from aidCACHE where object_name='authorpage_cache' and object_key=%s", (str(pageparam),)) if len(caches) >= 1: return caches[0] else: return [] def update_cached_author_page_timestamp(pageparam): ''' Updates cached author page timestamp @param pageparam: int personid ''' #TABLE: id, tag, identifier, data, date run_sql("update aidCACHE set last_updated=now() where object_name='authorpage_cache' and object_key=%s", (str(pageparam),)) def update_cached_author_page(pageparam, page): ''' Updates cached author page, deleting old caches for same pageparam @param pageparam: int personid @param page: string html authorpage ''' #TABLE: id, tag, identifier, data, date run_sql("delete from aidCACHE where object_name='authorpage_cache' and object_key=%s", (str(pageparam),)) run_sql("insert into aidCACHE values (Null,'authorpage_cache',%s,%s,now())", (str(pageparam), str(page))) def personid_remove_automatically_assigned_papers(pids=None): ''' Part of the person repair facility. Removes every person entity that has no prior human interaction. Will run on all person entities if pids == None @param pids: List of tuples of person IDs @type pids: list of tuples ''' if not pids: pids = run_sql("select distinct personid from aidPERSONID") for pid in pids: tickets = run_sql("select id from aidPERSONID where tag like 'rt%%' and personid=%s", (pid[0],)) pclaims = run_sql("select id from aidPERSONID where tag='paper' and (flag='2') and personid=%s", (pid[0],)) nclaims = run_sql("select id from aidPERSONID where tag='paper' and (flag='-2') and personid=%s", (pid[0],)) if len(tickets) > 0 and len(pclaims) == 0 and len(nclaims) == 0: continue elif len(tickets) == 0 and len(pclaims) == 0 and len(nclaims) == 0: run_sql("delete from aidPERSONID where personid=%s", (pid[0],)) elif len(pclaims) > 0: run_sql("delete from aidPERSONID where tag='paper' and (flag <> '-2' and flag <> '2') and personid=%s", (pid[0],)) elif len(nclaims) > 0: continue def _load_gender_firstnames_dict(files=''): if not files: files = {'boy': CFG_ETCDIR + '/bibauthorid/name_authority_files/male_firstnames.txt', 'girl': CFG_ETCDIR + '/bibauthorid/name_authority_files/female_firstnames.txt'} boyf = open(files['boy'], 'r') boyn = [x.strip().lower() for x in boyf.readlines()] boyf.close() girlf = open(files['girl'], 'r') girln = [x.strip().lower() for x in girlf.readlines()] girlf.close() return {'boys':boyn, 'girls':girln} def _load_firstname_variations(filename=''): #will load an array of arrays: [['rick','richard','dick'],['john','jhonny']] if not filename: filename = CFG_ETCDIR + '/bibauthorid/name_authority_files/name_variants.txt' retval = [] r = re.compile("\n") fp = open(filename) for l in fp.readlines(): lr = r.sub("", l) retval.append([clean_name_string(name.lower(), "", False, True) for name in lr.split(";") if name]) fp.close() return retval def personid_split_person_with_mixed_firstname_new(pids=None): gendernames = _load_gender_firstnames_dict() name_variations = _load_firstname_variations() if not pids: pids = run_sql("select distinct personid from aidPERSONID") for p in pids: pid = -1 if isinstance(p, list) or isinstance(p, tuple): pid = p[0] elif isinstance(p, int): pid = p else: print "The PID list is invalid." 
_perform_split_person_on_pid(pid, gendernames, name_variations) def _perform_split_person_on_pid(pid, gendernames, name_variations): print_plot_stats = False print_debug = True print_ignored = False perform_names_update_on_split = True new_clusters = {} new_cluster_index = 0 names = {} # bibref: name variants = {} # name: list of bibrefs for the same unified name variants_with_names = set() variants_paper_count = {} already_clustered = set() # find all papers, bibrefs and names for this person... papers = run_sql("select data from aidPERSONID where tag=%s and " "personid=%s", ('paper', str(pid))) bibrefs = {} for pap in papers: try: bibrefs[pap[0].split(',')[0]] += 1 except Exception: bibrefs[pap[0].split(',')[0]] = 1 for i in bibrefs.items(): name = run_sql("select Name from aidAUTHORNAMES where " "id = (select Name_id from aidAUTHORNAMESBIBREFS " "where bibref = %s)", (i[0],)) if len(name) > 0: if len(name[0][0]) > 2: names[i[0]] = name[0][0] for n in names.items(): if n[1] not in variants: variants[n[1]] = [n[0]] else: variants[n[1]].append(n[0]) for n in variants.items(): if len(split_name_parts(n[0])[2]) > 0: variants_with_names.add(n[0]) if len(variants_with_names) <= 1: #print ' Nothing to do here...' # 0..1 name only if print_ignored: print " |-- IGNORED #names < 2: %s" % variants_with_names return else: for variant_with_name in variants_with_names: if not variant_with_name in variants: continue for ref in variants[variant_with_name]: if variant_with_name in variants_paper_count: variants_paper_count[variant_with_name] += bibrefs[ref] else: variants_paper_count[variant_with_name] = bibrefs[ref] if print_debug: print "|-- Considering PID %s" % pid # Perform hierarchical decomposition... sorted_variants_paper_count = sorted(variants_paper_count.items(), key=itemgetter(1), reverse=True) for vpc_index, vpc in enumerate(sorted_variants_paper_count): if vpc_index in already_clustered: continue if print_debug: print " |-- HC Reference (%s): %s" % (vpc_index, vpc[0]) already_clustered.add(vpc_index) new_clusters[new_cluster_index] = [vpc_index] ref_name = vpc[0] for cmp_eidx, comp in enumerate(sorted_variants_paper_count[vpc_index + 1:]): cmp_idx = cmp_eidx + vpc_index + 1 if cmp_idx in already_clustered: continue if print_debug: print " |-- Compare to (%s): %s" % (cmp_idx, comp[0]) split = False name1 = split_name_parts(ref_name) name2 = split_name_parts(comp[0]) - composits_eq = names_are_equal_composites(name1, name2) - gender_eq = names_are_equal_gender(name1, name2, gendernames) - ldist = names_minimum_levenshtein_distance(name1, name2) - vars_eq = names_are_synonymous(name1, name2, name_variations) - substr_eq = names_are_substrings(name1, name2) + composits_eq = full_names_are_equal_composites(name1, name2) + gender_eq = full_names_are_equal_gender(name1, name2, gendernames) + ldist = full_names_minimum_levenshtein_distance(name1, name2) + vars_eq = full_names_are_synonymous(name1, name2, name_variations) + substr_eq = full_names_are_substrings(name1, name2) onames = name1[2] tnames = name2[2] oname = "".join(onames).lower() tname = "".join(tnames).lower() minlen = min(len(oname), len(tname)) maxlen = max(len(oname), len(tname)) if (not composits_eq and (not gender_eq or (((ldist == 1 and (float(ldist) / float(minlen) > .32)) or (ldist == 2 and (float(ldist) / float(minlen) > .39)) or ldist > 2) and not vars_eq and (not substr_eq and ldist > 1)))): split = True if print_plot_stats: if (not composits_eq and gender_eq and ldist > 0 and not vars_eq and not substr_eq): print("[%s,%s,%s] 
... %s and %s on pid %s" % (ldist, (ldist / float(minlen)), (ldist / float(maxlen)), oname, tname, pid)) if print_debug: print " |-- Composites: %s" % composits_eq print " |-- Gender eq: %s" % gender_eq print " |-- L dist: %s" % ldist print " |-- Synonyms: %s" % vars_eq print " |-- Substr: %s" % substr_eq print " |-- Split: %s" % split if not split: if not new_cluster_index in new_clusters: new_clusters[new_cluster_index] = [] already_clustered.add(cmp_idx) new_clusters[new_cluster_index].append(cmp_idx) if print_debug: print(" |-- Adding idx %s to cluster %s" % (cmp_idx, new_cluster_index)) # else: # if print_plot_stats: # name1 = split_name_parts(ref_name) # name2 = split_name_parts(comp[0]) # onames = name1[2] # tnames = name2[2] # oname = "".join(onames).lower() # tname = "".join(tnames).lower() # minlen = min(len(oname), len(tname)) # maxlen = max(len(oname), len(tname)) # print("[%s,%s,%s] ... %s and %s on pid %s" # % (ldist, (ldist/float(minlen)), # (ldist/float(maxlen)), oname, tname, pid)) # print("for %s and %s on pid %s" % (oname, tname, pid)) new_cluster_index += 1 # perform the split into n new clusters... updated_pids = [] if print_debug: print " |-- Creating %s new PIDs..." % (len(new_clusters) - 1) for key in new_clusters: if key == 0: updated_pids.append(pid) if not print_debug: continue cluster_names = [] for member in new_clusters[key]: cluster_names.append(sorted_variants_paper_count[member][0]) if print_debug: print " |-- Will leave %s with pid %s" % (" and ".join(cluster_names), pid) continue cluster_names = [] split_bibrefs = set() for member in new_clusters[key]: cluster_names.append(sorted_variants_paper_count[member][0]) for cname in cluster_names: if cname in variants: for bref in variants[cname]: split_bibrefs.add(bref) if print_debug: print " |-- Will put %s in a new cluster" % (" and ".join(cluster_names)) print " |-- Creating new Person for bibrefs %s" % ", ".join(split_bibrefs) newpid = run_sql("select max(personid)+1 from aidPERSONID") lineids = [] for i in split_bibrefs: lines = run_sql("select id from aidPERSONID where personid=%s and " "data like %s", (str(pid), str(i) + '%')) for line in lines: if line[0] not in lineids: lineids.append(line[0]) lineids.append(-1) lineids.append(-1) s = ("update aidPERSONID set personid='%s' " "where flag < '2' and id in %s") updated_lines = run_sql(s, (newpid[0][0], lineids)) if updated_lines > 0: updated_pids.append(newpid[0][0]) if print_debug: if updated_lines > 0: print " |-- Done. New personid: %s" % newpid[0][0] else: print " |-- A human claim protected the split." if perform_names_update_on_split and len(updated_pids) > 1: if print_debug: print " |-- Updating names for updated person IDs" pidlist = [[i] for i in updated_pids] update_personID_names_string_set(pidlist) if print_debug: print " " # All done for this person... def personid_count_names_variants(pids=None): ''' Experimental code to output as html a display of persons with not 'so much' compatible names associated. #fixme: not finished yet ''' if not pids: pids = run_sql("select distinct personid from aidPERSONID") for p in pids: pid = p[0] #print '' #print 'Working on: ', pid, '
' s = '' + str(pid) + ': ' papers = run_sql("select data from aidPERSONID where tag=%s and personid=%s ", ('paper', str(pid))) bibrefs = {} for p in papers: try: bibrefs[p[0].split(',')[0]] += 1 except (KeyError, IndexError): bibrefs[p[0].split(',')[0]] = 1 #print "Found bibrefs: ", bibrefs.items(), '
' names = {} for i in bibrefs.items(): name = run_sql("select db_name from aidAUTHORNAMES where id = (select Name_id from aidAUTHORNAMESBIBREFS where bibref = %s)", (i[0],)) if len(name) > 0: names[i[0]] = name[0][0] else: pass #print 'skipping ', i #print 'Found names: ', names.items(), '
' nlist = [] for n in names.items(): if len(split_name_parts(n[1])[2]) > 0: nlist.append([n[1], bibrefs[n[0]]]) nlist = sorted(nlist, key=lambda k: k[1], reverse=True) #print 'Ordered list: ', nlist, '
' comparisons = [] for n in nlist[1:]: comparisons.append(compare_names(nlist[0][0], n[0])) #print 'Comparisons: ', comparisons, '
' namess = 'Name variants: \n' namevs = '' for i in names.items(): namess = namess + ' %s \n' % i[1] namevs = namevs + '%s; ' % i[1] s = s + namess + ' \n debug:' + '\n comparisons: ' + str(comparisons) + '\n nlist: ' + str(nlist) + '\n nlistbibrefs: ' + str(names.items()) try: c = min(comparisons) except (KeyError, IndexError): c = 1.0 g = str(hex(int(c * 255)))[2:] r = str(hex(int(255 - c * 255)))[2:] if len(g) < 2: g = '0' + g if len(r) < 2: r = '0' + r if c < 0.8: print '☐ -- ' % s + '' + '
%s' % (str(pid), str(namevs) ) + '

\n\n' print '' % s + '' def create_new_person(uid, uid_is_owner=False): ''' Create a new person. Set the uid as owner if requested. ''' pid = run_sql("select max(personid) from aidPERSONID")[0][0] if pid: try: pid = int(pid) except (ValueError, TypeError): pid = -1 pid += 1 if uid_is_owner: set_person_data(pid, 'uid', str(uid)) set_person_data(pid, 'user-created', str(uid)) else: set_person_data(pid, 'user-created', str(uid)) return pid def get_pid_from_name_bibrec(bibrecs, name_string): ''' Finds a Person ID for a specific name on a specific list of record IDs @param bibrecs: list of record IDs @type bibrecs: list of int @param name_string: the name of an author on the papers @type name_string: string @return: a Person ID @rtype: int ''' found_bibrecs = bibrecs surname = name_string bibrec_names = [] for b in found_bibrecs: bibrec_names.append([b, get_field_values_on_condition(b, source='API', get_table=['100', '700'], get_tag='a')]) for n in bibrec_names: for i in list(n[1]): if soft_compare_names(surname.encode('utf-8'), i.encode('utf-8')) < 0.4: if i in n[1]: n[1].remove(i) #bibrec_names = [[78, set([u'M\xfcck, W'])]] #what is left are only suitable names for each record. bibrefrecs = [] for bibrec in bibrec_names: for name in bibrec[1]: bibrefs = get_bibrefs_from_name_string(name.encode('utf-8')) if len(bibrefs) < 1: continue for bibref in bibrefs[0][0].split(','): bibrefrecs.append(str(bibref) + ',' + str(bibrec[0])) #bibrefrec = ['100:116,78', '700:505,78'] brr = [[i] for i in bibrefrecs] possible_persons = get_possible_personids_from_paperlist(brr) #[[0L, ['700:316,10']]] possible_persons = sorted(possible_persons, key=lambda k: len(k[1])) return possible_persons def get_person_id_from_canonical_id(canonical_id): ''' Finds the person id from a canonical name (e.g. Ellis_J_R_1) @param canonical_id: the canonical ID @type canonical_id: string @return: sql result of the request @rtype: tuple of tuple ''' return run_sql("SELECT personid FROM aidPERSONID WHERE " "tag='canonical_name' AND data = %s", (canonical_id,)) def get_canonical_id_from_personid(pid): ''' Finds the person id canonical name (e.g. 
Ellis_J_R_1) @param pid @type int @return: sql result of the request @rtype: tuple of tuple ''' return run_sql("SELECT data FROM aidPERSONID WHERE " "tag='canonical_name' AND personid = %s", (str(pid),)) def get_persons_with_open_tickets_list(): ''' Finds all the persons with open tickets and returns pids and count of tickets @return: [[pid, ticket_count]] ''' try: return run_sql("select o.personid, count(distinct o.flag) from " "aidPERSONID o use index (`ptf-b`), " "(select distinct i.personid as iid from aidPERSONID i " "use index (`ptf-b`) where tag like 'rt_%') as dummy " "WHERE tag like 'rt_%' AND o.personid = dummy.iid " "group by o.personid") except (OperationalError, ProgrammingError): return run_sql("select o.personid, count(distinct o.flag) from " "aidPERSONID o, " "(select distinct i.personid as iid from aidPERSONID i " "where tag like 'rt_%') as dummy " "WHERE tag like 'rt_%' AND o.personid = dummy.iid " "group by o.personid") # try: # return run_sql('select personid,count(distinct(flag)) from aidPERSONID use index (`ptf-b`)' # 'where personid in (select distinct personid from aidPERSONID use index (`ptf-b`) ' # 'where tag like "rt_%") and tag like "rt_%" group by personid ') # except (OperationalError, ProgrammingError): # return run_sql('select personid,count(distinct(flag)) from aidPERSONID ' # 'where personid in (select distinct personid from aidPERSONID ' # 'where tag like "rt_%") and tag like "rt_%" group by personid ') def get_request_ticket(person_id, matching=None, ticket_id=None): ''' Retrieves one or many requests tickets from a person @param: person_id: person id integer @param: matching: couple of values to match ('tag', 'value') @param: ticket_id: ticket id (flag) value @returns: [[[('tag', 'value')], ticket_id]] [[[('a', 'va'), ('b', 'vb')], 1L], [[('b', 'daOEIaoe'), ('a', 'caaoOUIe')], 2L]] ''' use_index = True tickets = [] if ticket_id: rows = [] try: rows = [run_sql("select tag,data,flag from aidPERSONID use index (`ptf-b`) where tag like %s and personid=%s and flag=%s", ('rt_%', str(person_id), str(ticket_id)))] except (ProgrammingError, OperationalError): rows = [run_sql("select tag,data,flag from aidPERSONID where tag like %s and personid=%s and flag=%s", ('rt_%', str(person_id), str(ticket_id)))] use_index = False if len(rows) < 1: return [] else: rows = [] ids = [] if use_index: if not matching: ids = run_sql("select distinct flag from aidPERSONID use index (`ptf-b`) where personid=%s and tag like %s", (str(person_id), 'rt_%')) else: ids = run_sql("select distinct flag from aidPERSONID use index (`tdf-b`) where tag=%s and data=%s and personid=%s", ('rt_' + str(matching[0]), str(matching[1]), str(person_id))) else: if not matching: ids = run_sql("select distinct flag from aidPERSONID where personid=%s and tag like %s", (str(person_id), 'rt_%')) else: ids = run_sql("select distinct flag from aidPERSONID where tag=%s and data=%s and personid=%s", ('rt_' + str(matching[0]), str(matching[1]), str(person_id))) for tid in ids: if use_index: rows.append(run_sql("select tag,data,flag from aidPERSONID use index (`ptf-b`) where tag like %s and personid=%s and flag = %s", ('rt_%', str(person_id), str(tid[0])))) else: rows.append(run_sql("select tag,data,flag from aidPERSONID where tag like %s and personid=%s and flag = %s", ('rt_%', str(person_id), str(tid[0])))) for row in rows: ticket = [] for line in row: ticket.append((line[0][3:], line[1])) try: tickets.append([ticket, row[0][2]]) except IndexError: pass return tickets def update_request_ticket(person_id, 
tag_data_tuple, ticket_id=None): ''' Creates / updates a request ticket for a personID @param: personid int @param: tag_data_tuples 'image' of the ticket: (('paper', '700:316,10'), ('owner', 'admin'), ('external_id', 'ticket_18')) @return: ticketid ''' #tags: rt_owner (the owner of the ticket, associating the rt_number to the transaction) # rt_external_id # rt_paper_cornfirm, rt_paper_reject, rt_paper_forget, rt_name, rt_email, rt_whatever #flag: rt_number if not ticket_id: last_id = [] try: last_id = run_sql("select max(flag) from aidPERSONID use index (`ptf-b`) where personid=%s and tag like %s", (str(person_id), 'rt_%'))[0][0] except (OperationalError, ProgrammingError): last_id = run_sql("select max(flag) from aidPERSONID where personid=%s and tag like %s", (str(person_id), 'rt_%'))[0][0] if last_id: ticket_id = last_id + 1 else: ticket_id = 1 delete_request_ticket(person_id, ticket_id) for d in tag_data_tuple: run_sql("insert into aidPERSONID (personid,tag,data,flag) values (%s,%s,%s,%s)", (str(person_id), 'rt_' + str(d[0]), str(d[1]), str(ticket_id))) return ticket_id def delete_request_ticket(person_id, ticket_id=None): ''' Removes a ticket from a person_id. If ticket_id is not provider removes all the tickets pending on a person. ''' if ticket_id: run_sql("delete from aidPERSONID where personid=%s and tag like %s and flag =%s", (str(person_id), 'rt_%', str(ticket_id))) else: run_sql("delete from aidPERSONID where personid=%s and tag like %s", (str(person_id), 'rt_%')) def update_personID_canonical_names(persons_list=None, overwrite=False, suggested=''): ''' Updates the personID table creating or updating canonical names for persons @param: persons_list: persons to consider for the update (('1'),) @param: overwrite: if to touch already existing canonical names @param: suggested: string to suggest a canonical name for the person ''' use_index = True if not persons_list: persons_list = run_sql('select distinct personid from aidPERSONID') for pid in persons_list: current_canonical = "" try: current_canonical = run_sql("select data from aidPERSONID use index (`ptf-b`) where personid=%s and tag=%s", (pid[0], 'canonical_name')) except (ProgrammingError, OperationalError): current_canonical = run_sql("select data from aidPERSONID where personid=%s and tag=%s", (pid[0], 'canonical_name')) use_index = False if (not overwrite) and len(current_canonical) > 0: continue else: names = [] if use_index: names = run_sql("select data,flag from aidPERSONID use index (`ptf-b`) where personid=%s and tag=%s", (pid[0], 'gathered_name')) else: names = run_sql("select data,flag from aidPERSONID where personid=%s and tag=%s", (pid[0], 'gathered_name')) names = sorted(names, key=lambda k: k[1], reverse=True) if len(names) < 1 and not suggested: continue else: if suggested: canonical_name = suggested else: canonical_name = create_canonical_name(names[0][0]) run_sql("delete from aidPERSONID where personid=%s and tag=%s", (pid[0], 'canonical_name')) existing_cnames = [] if use_index: existing_cnames = run_sql("select data from aidPERSONID use index (`tdf-b`) where tag=%s and data like %s", ('canonical_name', str(canonical_name) + '%')) else: existing_cnames = run_sql("select data from aidPERSONID where tag=%s and data like %s", ('canonical_name', str(canonical_name) + '%')) max_idx = 0 for i in existing_cnames: this_cid = 0 if i[0].count("."): this_cid = i[0].split(".")[-1] max_idx = max(max_idx, int(this_cid)) canonical_name = canonical_name + '.' 
+ str(max_idx + 1) run_sql("insert into aidPERSONID (personid,tag,data) values (%s,%s,%s) ", (pid[0], 'canonical_name', canonical_name)) def update_personID_table_from_paper(papers_list=None, personid=None): ''' Updates the personID table removing the bibrec / bibrefs couples no longer existing (after a paper has been updated (name changed)) @param papers_list: list of papers to consider for the update (bibrecs) (('1'),) @param type papers_list: tuple/list of tuples/lists of integers/strings which represent integers @param personid: limit to given personid (('1',),) @param type personid: tuple/list of tuples/lists of integers/strings which represent integers @return: None ''' def extract_bibrec(paper): ''' Extracts bibrec from a record like 100:312,53. In the given example the function will return 53. ''' try: - return paper[0].split(',')[1] + return paper.split(',')[1] except IndexError: - return paper[0] + return paper def list_2_SQL_str(items, f): """ Concatenates all items in items to a sql string using f. @param items: a set of items @param type items: X @param f: a function which transforms each item from items to string @param type f: X:->str @return: "(x1, x2, x3, ... xn)" for xi in items @return type: string """ strs = tuple("%s, " % (f(x)) for x in items) concat = "".join(strs) return "(%s)" % concat[0:len(concat) - 2] def collect_person_id_data(select, index, where, person, limit=""): """ Runs a sql query whit the arguments above. If the index is not found the function ignores it. """ - if personid: + if person: try: query = "%s %s %s and personid in %s %s" % (select, index, where, person, limit) return run_sql(query) except (ProgrammingError, OperationalError): query = "%s %s and personid in %s %s" % (select, where, person, limit) return run_sql(query) else: try: query = "%s %s %s %s" % (select, index, where, limit) return run_sql(query) except (ProgrammingError, OperationalError): query = "%s %s %s" % (select, where, limit) return run_sql(query) class Worker(Thread): def __init__(self, q): Thread.__init__(self) self.q = q - def run(self): while self.q.empty() == False: self.paper = self.q.get() self.check_paper() self.q.task_done() - def check_paper(self): if bconfig.TABLES_UTILS_DEBUG: print " -> processing paper = %s" % (self.paper[0],) fullbibrefs100 = run_sql("select id_bibxxx from bibrec_bib10x where id_bibrec=%s", (self.paper[0],)) if len(fullbibrefs100) > 0: fullbibrefs100str = list_2_SQL_str(fullbibrefs100, lambda x: str(x[0])) bibrefs100 = run_sql("select id from bib10x where tag='100__a' and id in %s" % (fullbibrefs100str,)) else: bibrefs100 = () fullbibrefs700 = run_sql("select id_bibxxx from bibrec_bib70x where id_bibrec=%s", (self.paper[0],)) if len(fullbibrefs700) > 0: fullbibrefs700str = list_2_SQL_str(fullbibrefs700, lambda x: str(x[0])) bibrefs700 = run_sql("select id from bib70x where tag='700__a' and id in %s" % (fullbibrefs700str,)) else: bibrefs700 = () bibrecreflist = frozenset(["100:%s,%s" % (str(i[0]), self.paper[0]) for i in bibrefs100] + ["700:%s,%s" % (str(i[0]), self.paper[0]) for i in bibrefs700]) pid_rows_lazy = None #finally, if a bibrec/ref pair is in the authornames table but not in this list that name of that paper #is no longer existing and must be removed from the table. 
The new one will be addedd by the #update procedure in future; this entry will be risky becouse the garbage collector may #decide to kill the bibref in the bibX0x table for row in self.paper[1]: if row[3] not in bibrecreflist: if not pid_rows_lazy: pid_rows_lazy = collect_person_id_data(select="select id, personid, tag, data, flag, lcul from aidPERSONID", index="use index (`tdf-b`,`ptf-b`)", where="where tag='paper' and data like '%%,%s'" % (self.paper[0],), person=personid_q) - other_bibrefs = [b[3] for b in pid_rows_lazy if b[1] == row[1] and b[3] != row[3]] + other_bibrefs = [b[0] for b in pid_rows_lazy if b[1] == row[1] and b[3] != row[3]] run_sql("delete from aidPERSONID where id = %s", (row[0],)) if bconfig.TABLES_UTILS_DEBUG: print "* deleting record with missing bibref: id = %s, personid = %s, tag = %s, data = %s, flag = %s, lcul = %s" % row + print "found %d other records with the same personid and bibrec" % len(other_bibrefs) if len(other_bibrefs) == 1: #we have one and only one sobstitute, we can switch them! - run_sql("update aidPERSONID set flag=%s,lcul=%s where id=%s", (str(row[4]), str(row[5]), str(other_bibrefs[0][0]))) + run_sql("update aidPERSONID set flag=%s,lcul=%s where id=%s", (str(row[4]), str(row[5]), str(other_bibrefs[0]))) + if bconfig.TABLES_UTILS_DEBUG: + print "updating id=%d with flag=%d,lcul=%d" % (other_bibrefs[0], row[4], row[5]) persons_to_update = set([(p[1],) for p in self.paper[1]]) update_personID_canonical_names(persons_to_update) + if papers_list: + papers_list = frozenset([int(x[0]) for x in papers_list]) deleted_recs = run_sql("select o.id_bibrec from bibrec_bib98x o, \ (select i.id as iid from bib98x i \ where value = 'DELETED' \ and tag like '980__a') as dummy \ where o.id_bibxxx = dummy.iid") deleted_recs = frozenset(x[0] for x in deleted_recs) if bconfig.TABLES_UTILS_DEBUG: print "%d total deleted papers" % (len(deleted_recs),) if personid: personid_q = list_2_SQL_str(personid, lambda x: str(x[0])) else: personid_q = None - if papers_list: - if not (isinstance(papers_list, set) or isinstance(papers_list, frozenset)): - papers_list = frozenset(papers_list) - counter = 0 - rows_limit = 1000000 + rows_limit = 10000000 end_loop = False while not end_loop: papers_data = collect_person_id_data(select="select id, personid, tag, data, flag, lcul from aidPERSONID", index="use index (`ptf-b`)", where="where tag='paper'", person=personid_q, limit="limit %d, %d" % (counter, rows_limit,)) if bconfig.TABLES_UTILS_DEBUG: print "query with limit %d %d" % (counter, rows_limit) if len(papers_data) == rows_limit: counter += rows_limit else: end_loop = True papers_data = tuple((extract_bibrec(p[3]), p) for p in papers_data) to_remove = set() jobs = dict() for p in papers_data: if int(p[0]) in deleted_recs: to_remove.add(p[1][0]) - elif not papers_list or p[0] in papers_list: + elif not papers_list or int(p[0]) in papers_list: jobs[p[0]] = jobs.get(p[0], []) + [p[1]] del(papers_data) if len(to_remove) > 0: delstr = list_2_SQL_str(to_remove, lambda x: str(x)) delta = run_sql("delete from aidPERSONID where tag='paper' and id in %s" % (delstr,)) counter -= delta if bconfig.TABLES_UTILS_DEBUG: print "* deleting %d papers, from %d, marked as deleted: %s" % (delta, len(to_remove), delstr) jobslist = Queue() for p in jobs.items(): jobslist.put(p) for i in range(bconfig.CFG_BIBAUTHORID_MAX_PROCESSES): t = Worker(jobslist) t.daemon = True t.start() jobslist.join() def personid_perform_cleanup(): ''' Performs a consistency cleanup on the data in personID tables. 
It is usually not needed to have papers manually assigned to a personID to be even rejected from a different personID. This method thus takes care of eliminating such a redudancy in the table where it happens. It's not done during the update process for speed reasons. ''' #consistency check: #papers which have been assigned by users should appear in only one place #This will no longer be needed if the update_from_algorithm will be modified #to take that into account, now it is not for performance reasons run_sql("delete from aidPERSONID where tag='paper' and flag <='-1' and \ data in (select data from aidPERSONID where tag='paper' and flag='2')") update_personID_canonical_names() def confirm_papers_to_person(pid, papers, user_level=0): ''' Confirms the relationship between pid and paper, as from user input. @param pid: id of the person @type pid: ('2',) @param papers: list of papers to confirm @type papers: (('100:7531,9024',),) ''' #expects a pid ('2',) #and a lst of papers (('100:7531,9024',),) # class names_gatherer(Thread): # def __init__(self, pid): # Thread.__init__(self) # self.pid = pid # # def run(self): # update_personID_names_string_set(self.pid) # close_connection() updated_pids = [] for p in papers: old_owners = [] try: old_owners = run_sql("select personid from aidPERSONID use index (`tdf-b`) where tag=%s and data=%s", ('paper', str(p[0]),)) except (OperationalError, ProgrammingError): old_owners = run_sql("select personid from aidPERSONID where tag=%s and data=%s", ('paper', str(p[0]),)) if len(old_owners) > 0: for owner in old_owners: updated_pids.append((str(owner[0]),)) run_sql("delete from aidPERSONID where tag=%s and data=%s", ('paper', str(p[0]),)) run_sql("insert into aidPERSONID (PersonID, tag, data, flag, lcul) values (%s,'paper',%s,'2', %s)", (str(pid[0]), str(p[0]), user_level)) update_personID_names_string_set((pid,)) #upd_thread = names_gatherer(tuple(updated_pids)) #upd_thread.start() update_personID_names_string_set(tuple(updated_pids)) update_personID_canonical_names([pid]) def reject_papers_from_person(pid, papers, user_level=0): ''' Confirms the negative relationship between pid and paper, as from user input. 
@param pid: id of the person @type pid: ('2',) @param papers: list of papers to confirm @type papers: (('100:7531,9024',),) ''' #expects a pid ('2',) #and a lst of papers (('100:7531,9024',),) #check if already assigned by user and skip those ones for p in papers: run_sql("update aidPERSONID set flag=%s,lcul=%s where PersonID=%s and data=%s", ('-2', user_level, str(pid[0]), str(p[0]))) update_personID_names_string_set((pid,)) update_personID_canonical_names([pid]) def reset_papers_flag(pid, papers): ''' Resets the flag associated to the papers to '0' @param papers: list of papers to confirm @type papers: (('100:7531,9024',),) ''' for p in papers: run_sql("update aidPERSONID set flag='0',lcul='0' where tag=%s and data=%s", ('paper', str(p[0]))) update_personID_names_string_set((pid,)) update_personID_canonical_names() def get_papers_status(papers): ''' Gets the personID and flag assiciated to papers @param papers: list of papers @type papers: (('100:7531,9024',),) @return: (('data','personID','flag',),) @rtype: tuple of tuples ''' #lst of papers (('100:7531,9024',),) #for each paper gives: personid, assignment status papersstr = '( ' for p in papers: papersstr += '\'' + str(p[0]) + '\',' papersstr = papersstr[0:len(papersstr) - 1] + ' )' if len(papers) >= 1: ret_val = [] try: ret_val = run_sql("select data,PersonID,flag from aidPERSONID use index (`tdf-b`) where tag=%s and data in " + papersstr, ('paper',)) except (ProgrammingError, OperationalError): ret_val = run_sql("select data,PersonID,flag from aidPERSONID where tag=%s and data in " + papersstr, ('paper',)) return ret_val else: return [] def get_person_papers(pid, flag, show_author_name=False, show_title=False, show_rt_status=False): ''' Returns all the paper associated to a person with a flag greater or equal than the given one. Eventually returns even author name and title associated to the papers. @param pid: person id @type pid: ('2',) @param flag: numerical flag, the convention is documented with the database table creation script @type papers: integer @param show_author_name: Also return authorname in dict? @type show_author_name: Boolean @param show_title: Also return title in dict? @type show_title: Boolean @param show_rt_status: Also return if this paper is currently mentioned in a ticket to be reviewed by an operator. @return: [{'data': String, 'flag': Int, 'author_name': String, 'title': String, 'rt_status': Boolean}] author_name and title will be returned depending on the params @rtype: list of dicts ''' #expects a pid ('2',) #and a flag 0 try: from invenio.search_engine import get_record except ImportError: return [] paperslist = [] docs = [] try: flag = int(flag) except ValueError: return paperslist try: docs = run_sql("SELECT data,flag FROM aidPERSONID use index (`ptf-b`) " "where personid = %s " "and tag = %s and flag >= %s", (pid[0], 'paper', flag)) except (ProgrammingError, OperationalError): docs = run_sql("SELECT data,flag FROM aidPERSONID where personid = %s" " and tag = %s and flag >= %s", (pid[0], 'paper', flag)) for doc in docs: listdict = {} if show_title: title = "No title on paper..." 
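        #Here doc[0] is a bibref,bibrec pair such as '700:1234,5678' (values
        #illustrative only): the part after the comma is the bibrec id, whose
        #MARC field 245 is read just below to obtain the title.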
try: rec_id = int(doc[0].split(',')[1]) title = get_record(rec_id)['245'][0][0][0][1] except (IndexError, KeyError, ValueError): title = "Problem encountered while retrieving document title" listdict["title"] = title dsplit = doc[0].split(',') tnum = "70" if str(dsplit[0].split(':')[0]) == "100": tnum = "10" sqlstr = ("SELECT value FROM bib%sx WHERE id = " % (tnum)) + '%s' authorname = run_sql(sqlstr, (dsplit[0].split(':')[1],)) try: authorname = authorname[0][0] if show_author_name: listdict["authorname"] = authorname.decode("utf-8") except IndexError: #The paper has been modified and this bibref is no longer there #@TODO: this must call bibsched to update_personid_table_from_paper continue listdict["data"] = doc[0] listdict["flag"] = doc[1] if show_rt_status: rt_count = run_sql("SELECT count(*) FROM aidPERSONID WHERE " "tag like 'rt_%%' and data = %s", (doc[0],)) try: rt_count = int(rt_count[0][0]) except (IndexError, ValueError, TypeError): rt_count = 0 if rt_count > 0: listdict["rt_status"] = True else: listdict["rt_status"] = False paperslist.append(listdict) return paperslist def add_person_paper_needs_manual_review(pid, bibrec): ''' Adds to a person a paper which needs manual review before bibref assignment @param pid: personid, int @param bibrec: the bibrec, int ''' set_person_data(pid, 'paper_needs_bibref_manual_confirm', bibrec) def get_person_papers_to_be_manually_reviewed(pid): ''' Returns the set of papers awaiting for manual review for a person for bibref assignment @param pid: the personid, int ''' return get_person_data(pid, 'paper_needs_bibref_manual_confirm') def del_person_papers_needs_manual_review(pid, bibrec): ''' Deletes from the set of papers awaiting for manual review for a person @param pid: personid, int @param bibrec: the bibrec, int ''' del_person_data(pid, 'paper_needs_bibref_manual_confirm', bibrec) def get_person_data(person_id, tag=None): ''' Returns all the records associated to a person. If tag != None only rows for the selected tag will be returned. @param person_id: id of the person to read the attribute from @type person_id: int @param tag: the tag to read. Optional. Default: None @type tag: string @return: the data associated with a virtual author @rtype: tuple of tuples ''' rows = [] if tag: rows = run_sql("SELECT tag, data FROM aidPERSONID " "WHERE personid = %s AND tag = %s", (person_id, tag)) else: rows = run_sql("SELECT tag, data FROM aidPERSONID " "WHERE personid = %s", (person_id,)) return rows def set_person_data(person_id, tag, value, user_level=0): ''' Change the value associated to the given tag for a certain person. 
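    A minimal illustrative usage (person id, tag and value invented):
    set_person_data(5, 'inspire_uid', 'uid_inspire_42')
    If a row with this person, tag and value already exists it is updated in
    place, otherwise a new row is inserted with flag 0 and the given user
    level; the canonical names are refreshed afterwards.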
@param person_id: ID of the person @type person_id: int @param tag: tag to be updated @type tag: string @param value: value to be written for the tag @type value: string ''' current_tag_value = [] try: current_tag_value = run_sql("SELECT data FROM aidPERSONID use index (`ptf-b`) " "WHERE personid = %s AND tag = %s AND " "data = %s", (person_id, tag, value)) except (OperationalError, ProgrammingError): current_tag_value = run_sql("SELECT data FROM aidPERSONID " "WHERE personid = %s AND tag = %s AND " "data = %s", (person_id, tag, value)) if len(current_tag_value) > 0: run_sql("UPDATE aidPERSONID SET tag = %s, data = %s WHERE " "personid = %s AND tag = %s AND lcul = %s", (tag, value, person_id, tag, user_level)) else: run_sql("INSERT INTO aidPERSONID (`personid`, `tag`, `data`, `flag`, `lcul`) " "VALUES (%s, %s, %s, %s, %s);", (person_id, tag, value, '0', user_level)) update_personID_canonical_names([[person_id]]) def del_person_data(person_id, tag, value=None): ''' Change the value associated to the given tag for a certain person. @param person_id: ID of the person @type person_id: int @param tag: tag to be updated @type tag: string @param value: value to be written for the tag @type value: string ''' if not value: run_sql("delete from aidPERSONID where personid=%s and tag=%s", (person_id, tag)) else: run_sql("delete from aidPERSONID where personid=%s and tag=%s and data=%s", (person_id, tag, value)) def get_person_names_count(pid): ''' Returns the set of name strings and count associated to a person id @param pid: ID of the person @type pid: ('2',) @param value: value to be written for the tag @type value: string ''' ret_val = [] try: ret_val = run_sql("select data,flag from aidPERSONID use index (`ptf-b`) where PersonID=%s and tag=%s", (str(pid[0]), 'gathered_name',)) except (OperationalError, ProgrammingError): ret_val = run_sql("select data,flag from aidPERSONID where PersonID=%s and tag=%s", (str(pid[0]), 'gathered_name',)) return ret_val def get_person_db_names_count_old(pid): ''' Returns the set of name strings and count associated to a person id. The name strings are as found in the database. 
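    Illustrative return value (names and counts invented): roughly
    (('"Ellis, J" "Ellis, John" ', 7),), where database name variants that
    normalise to the same gathered name are grouped into one quoted string
    and share its count.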
@param pid: ID of the person @type pid: ('2',) @param value: value to be written for the tag @type value: string ''' norm_names_count = [] try: norm_names_count = run_sql("select data,flag from aidPERSONID use index (`ptf-b`) " "where PersonID=%s and tag='gathered_name'", (str(pid[0]),)) except (OperationalError, ProgrammingError): norm_names_count = run_sql("select data,flag from aidPERSONID where " "PersonID=%s and tag='gathered_name'", (str(pid[0]),)) norm_names_count_dict = {} db_names_count_dict = {} db_names = get_person_names_set(pid) return_list = [] for name, count in norm_names_count: norm_names_count_dict[name] = count names_to_join = [] for name in norm_names_count_dict: names_to_join.append([[name], []]) for db_name in db_names: try: ndb_name = create_normalized_name(split_name_parts(db_name[0])) db_names_count_dict[db_name[0]] = norm_names_count_dict[ndb_name] for i in names_to_join: if ndb_name in i[0]: i[1].append(db_name[0]) except (KeyError): db_names_count_dict[db_name[0]] = 1 for nl in names_to_join: name_string = '' for n in nl[1]: name_string += '"' + str(n) + '" ' if len(nl[1]) < 1: name_string = '"' + str(nl[0][0]) + '" ' return_list.append((name_string, norm_names_count_dict[nl[0][0]])) # for name, count in db_names_count_dict.iteritems(): # return_list.append((name, count)) # return_list = sorted(return_list, key=lambda k: k[0], reverse=False) return tuple(return_list) def get_person_db_names_count(pid, sort_by_count=True): ''' Returns the set of name strings and count associated to a person id. The name strings are as found in the database. @param pid: ID of the person @type pid: ('2',) ''' docs = [] try: docs = run_sql("SELECT `data` FROM `aidPERSONID` use index (`ptf-b`) where PersonID=%s and tag=%s and flag>=%s", (str(pid[0]), 'paper', '-1',)) except (ProgrammingError, OperationalError): docs = run_sql("SELECT `data` FROM `aidPERSONID` where PersonID=%s and tag=%s and flag>=%s", (str(pid[0]), 'paper', '-1',)) authornames = {} for doc in docs: dsplit = doc[0].split(',') tnum = "70" if str(dsplit[0].split(':')[0]) == "100": tnum = "10" sqlstr = "SELECT value FROM bib%sx WHERE id = " % tnum + "%s" authorname = run_sql(sqlstr, (dsplit[0].split(':')[1],)) if len(authorname) > 0: if authorname[0][0] not in authornames: authornames[authorname[0][0]] = 1 else: authornames[authorname[0][0]] += 1 authornames = list(authornames.iteritems()) if sort_by_count: authornames = sorted(authornames, key=lambda k: k[0], reverse=False) return authornames def get_person_names_set(pid): ''' Returns the set of name strings associated to a person id @param pid: ID of the person @type pid: ('2',) @param value: value to be written for the tag @type value: string ''' docs = [] try: docs = run_sql("SELECT `data` FROM `aidPERSONID` use index (`ptf-b`) where PersonID=%s and tag=%s and flag>=%s", (str(pid[0]), 'paper', '-1',)) except (ProgrammingError, OperationalError): docs = run_sql("SELECT `data` FROM `aidPERSONID` where PersonID=%s and tag=%s and flag>=%s", (str(pid[0]), 'paper', '-1',)) authornames = set() for doc in docs: dsplit = doc[0].split(',') tnum = "70" if str(dsplit[0].split(':')[0]) == "100": tnum = "10" sqlstr = "SELECT value FROM bib%sx WHERE id = " % tnum + "%s" authorname = run_sql(sqlstr, (dsplit[0].split(':')[1],)) if len(authorname) > 0: authornames.add(authorname[0]) return list(authornames) def find_personIDs_by_name_string(namestring, strict=False): ''' Search engine to find persons matching the given string The matching is done on the surname first, and names if 
present. An ordered list (per compatibility) of pids and found names is returned. @param namestring: string name, 'surname, names I.' @type: string @param strict: Define if this shall perform an exact or a fuzzy match @type strict: boolean @return: pid list of lists [pid,[[name string, occur count, compatibility]]] ''' canonical = [] use_index = True try: canonical = run_sql("select personid,data from aidPERSONID use index (`tdf-b`) where data like %s and tag=%s", (namestring + '%', 'canonical_name')) except (ProgrammingError, OperationalError): canonical = run_sql("select personid,data from aidPERSONID where data like %s and tag=%s", (namestring + '%', 'canonical_name')) use_index = False namestring_parts = split_name_parts(namestring) # The following lines create the regexp used in the query. surname = clean_name_string(namestring_parts[0], # replacement=".{0,3}", replacement="%", keep_whitespace=False, trim_whitespaces=True) surname = surname + ',%' matching_pids_names_tuple = [] if use_index: matching_pids_names_tuple = run_sql("select o.personid, o.data, o.flag from aidPERSONID o use index (`ptf-b`), " "(select distinct i.personid as ipid from aidPERSONID i use index (`tdf-b`) where i.tag='gathered_name' and i.data like %s)" " as dummy where o.tag='gathered_name' and o.personid = dummy.ipid", (surname,)) # matching_pids_names_tuple = run_sql("select personid, data, flag from aidPERSONID use index (`ptf-b`) " # "where tag=\'gathered_name\' and personid in " # "(select distinct personid from aidPERSONID use index (`tdf-b`) " # "where tag=\'gathered_name\' and data like %s)", (surname,)) else: matching_pids_names_tuple = run_sql("select o.personid, o.data, o.flag from aidPERSONID o, " "(select distinct i.personid as ipid from aidPERSONID i where i.tag='gathered_name' and i.data like %s)" " as dummy where o.tag='gathered_name' and o.personid = dummy.ipid", (surname,)) # print matching_pids_names_tuple if len(matching_pids_names_tuple) == 0 and len(surname) >= 2: surname = surname[0:len(surname) - 2] + '%,%' if use_index: matching_pids_names_tuple = run_sql("select o.personid, o.data, o.flag from aidPERSONID o use index (`ptf-b`), " "(select distinct i.personid as ipid from aidPERSONID i use index (`tdf-b`) where i.tag='gathered_name' and i.data like %s)" " as dummy where o.tag='gathered_name' and o.personid = dummy.ipid", (surname,)) else: matching_pids_names_tuple = run_sql("select o.personid, o.data, o.flag from aidPERSONID o, " "(select distinct i.personid as ipid from aidPERSONID i where i.tag='gathered_name' and i.data like %s)" " as dummy where o.tag='gathered_name' and o.personid = dummy.ipid", (surname,)) if len(matching_pids_names_tuple) == 0 and len(surname) >= 2: surname = '%' + surname[0:len(surname) - 2] + '%,%' if use_index: matching_pids_names_tuple = run_sql("select o.personid, o.data, o.flag from aidPERSONID o use index (`ptf-b`), " "(select distinct i.personid as ipid from aidPERSONID i use index (`tdf-b`) where i.tag='gathered_name' and i.data like %s)" " as dummy where o.tag='gathered_name' and o.personid = dummy.ipid", (surname,)) else: matching_pids_names_tuple = run_sql("select o.personid, o.data, o.flag from aidPERSONID o, " "(select distinct i.personid as ipid from aidPERSONID i where i.tag='gathered_name' and i.data like %s)" " as dummy where o.tag='gathered_name' and o.personid = dummy.ipid", (surname,)) matching_pids = [] # print matching_pids_names_tuple for name in matching_pids_names_tuple: comparison = soft_compare_names(namestring, name[1]) 
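            #comparison is the name-similarity score returned by
            #soft_compare_names (higher means a closer match); entries scoring
            #below 0.4 are discarded when the persons dictionary is built below.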
matching_pids.append([name[0], name[1], name[2], comparison]) # matching_pids = sorted(matching_pids, key=lambda k: k[3], reverse=True) # print matching_pids persons = {} if len(canonical) > 0: for n in canonical: matching_pids.append([n[0], n[1], 1, 1]) for n in matching_pids: if n[3] >= 0.4: if n[0] not in persons: persons[n[0]] = sorted([[p[1], p[2], p[3]] for p in matching_pids if p[0] == n[0]], key=lambda k: k[2], reverse=True) # print persons porderedlist = [] for i in persons.iteritems(): porderedlist.append([i[0], i[1]]) porderedlist = sorted(porderedlist, key=lambda k: k[1][0][1], reverse=False) porderedlist = sorted(porderedlist, key=lambda k: k[1][0][0], reverse=False) porderedlist = sorted(porderedlist, key=lambda k: k[1][0][2], reverse=True) if strict and len(porderedlist) >= 1: return [porderedlist[0]] return porderedlist def update_personID_names_string_set(PIDlist=None): ''' Updates the personID table with the names gathered from documents @param: list of pids to consider, if omitted performs an update on the entire db @type: tuple of tuples Gets all the names associated to the bibref/bibrec couples of the person and builds a set of names, counting the occurrencies. The values are store in the gathered_name/flag fields of each person. The gathering of names is an expensive operation for the database (many joins), so the operation is threaded so to have as many parallell queries as possible. ''' if not PIDlist or len(PIDlist) == 0: PIDlist = run_sql('SELECT DISTINCT `personid` FROM `aidPERSONID`') class names_gatherer(Thread): def __init__ (self, pid): Thread.__init__(self) self.pid = pid self.pstr = '' self.person_papers = None self.namesdict = None self.needs_update = None self.current_namesdict = None self.pname = None def run(self): self.namesdict = dict() use_index = True try: self.person_papers = run_sql("select data from `aidPERSONID` use index (`ptf-b`) where tag=\'paper\' and " " flag >= \'-1\' and PersonID=%s", (str(self.pid[0]),)) except (OperationalError, ProgrammingError): self.person_papers = run_sql("select data from `aidPERSONID` where tag=\'paper\' and " " flag >= \'-1\' and PersonID=%s", (str(self.pid[0]),)) use_index = False for p in self.person_papers: self.pname = run_sql("select Name from aidAUTHORNAMES where id = " "(select Name_id from aidAUTHORNAMESBIBREFS where bibref = %s)", (str(p[0].split(',')[0]),)) if len(self.pname) > 0: if self.pname[0][0] not in self.namesdict: self.namesdict[self.pname[0][0]] = 1 else: self.namesdict[self.pname[0][0]] += 1 if use_index: self.current_namesdict = dict(run_sql("select data,flag from aidPERSONID use index (`ptf-b`) where personID=%s " "and tag=\'gathered_name\'", (str(self.pid[0]),))) else: self.current_namesdict = dict(run_sql("select data,flag from aidPERSONID where personID=%s " "and tag=\'gathered_name\'", (str(self.pid[0]),))) self.needs_update = False if self.current_namesdict != self.namesdict: self.needs_update = True else: for i in self.namesdict.iteritems(): if i[1] != self.current_namesdict[i[0]]: self.needs_update = True if bconfig.TABLES_UTILS_DEBUG: pass # sys.stdout.write(str(self.pid) + str(i[1]) + ' differs from ' + str(self.current_namesdict[i[0]])) # sys.stdout.flush() if self.needs_update: if bconfig.TABLES_UTILS_DEBUG: pass # sys.stdout.write(str(self.pid) + ' updating!') # sys.stdout.flush() run_sql("delete from `aidPERSONID` where PersonID=%s and tag=%s", (str(self.pid[0]), 'gathered_name')) for name in self.namesdict: if bconfig.TABLES_UTILS_DEBUG: #print 'insert into aidPERSONID 
(PersonID, tag, data, flag) values ('+ str(self.pid[0]) + ',\'gathered_name\',\"' + str(name)+ '\",\"' + str(self.namesdict[name]) + '\")' pass # self.pstr += ' ' + str(self.pid[0]) + ' ...processing: ' + str(name) + ' ' + str(self.namesdict[name]) # run_sql('insert into aidPERSONID (PersonID, tag, data, flag) values (' # + str(self.pid[0]) + ',\'gathered_name\',\"' + str(name) # + '\",\"' + str(self.namesdict[name]) + '\")') run_sql('insert into aidPERSONID (PersonID, tag, data, flag) values (%s,%s,%s,%s)', (str(self.pid[0]), 'gathered_name', str(name), str(self.namesdict[name]))) close_connection() # else: # sys.stdout.write(str(self.pid) + ' not updating!') # sys.stdout.flush() # sys.stdout.write(self.pstr + '\n') # sys.stdout.flush() tgath = [] for pid in PIDlist: current = names_gatherer(pid) tgath.append(current) current.start() if bconfig.TABLES_UTILS_DEBUG: sys.stdout.write(str(pid) + '.\n') sys.stdout.flush() while threading.activeCount() > bconfig.PERSONID_SQL_MAX_THREADS: time.sleep(0.02) for t in tgath: t.join() def update_personID_from_algorithm(RAlist=None): ''' Updates the personID table with the results of the algorithm, taking into account user inputs @param: list of realauthors to consider, if omitted performs an update on the entire db @type: tuple of tuples This is the core of the matching between the bibauthorid world and the personid world. For each RA of the list, tries to find the person it should be (in an ideal world there is 100% matching in the list of papers, and the association is trivial). In the real world an RA might be wrongly carrying papers of more then one person (or a person might have papers of more then one RAs) so the matching must be done on a best-effort basis: -find the most compatible person -if it's compatible enough, merge the person papers with the ra papers (after a backtracking to find all the other RAs which the person might 'contain') -if nobody is compatible enough create a new person with RA papers Given the fuzzy nature of both the computation of RAs and the matching with persons, it has been decided to stick to the person all and only the papers which are carried by the RAs over a certain threshold. ''' def get_bibreclist(currentRA): #[['700:157610,453095', '1.0']] VAlist = run_sql("SELECT `virtualauthorID`,`p` FROM `aidREALAUTHORS` WHERE `realauthorID`=%s", (str(currentRA[0]),)) bibreclist = [] class get_va_bibreclist(Thread): def __init__ (self, va): Thread.__init__(self) self.va = va self.bibreclist = [] def run(self): bibrec = run_sql("select value from aidVIRTUALAUTHORSDATA where virtualauthorID=%s and tag=%s ", (str(self.va[0]), 'bibrefrecpair')) if len(bibrec) > 0: self.bibreclist.append([str(bibrec[0][0]) , str(self.va[1])]) close_connection() # !!Important!! return if bconfig.TABLES_UTILS_DEBUG: pass #print ' --debug: thread spawn for bibreclist of va: ' + str(self.va) bibrec = dict(run_sql("SELECT `tag`,`value` FROM `aidVIRTUALAUTHORSDATA` WHERE " "virtualauthorID=%s and (tag=%s or tag=%s)", (str(self.va[0]), 'bibrec_id', 'orig_authorname_id'))) if (not bibrec.has_key("orig_authorname_id")) or (not bibrec.has_key("bibrec_id")): if bconfig.TABLES_UTILS_DEBUG: print ("WARNING: VA %s holds no data." 
% self.va[0]) return bibreflist = run_sql("SELECT `bibrefs` FROM `aidAUTHORNAMES` WHERE `id`=%s", (str(bibrec['orig_authorname_id']),)) bibreflist = bibreflist[0][0].split(',') bibref100string = '(' bibref700string = '(' for br in bibreflist: if br.split(':')[0] == '100': bibref100string += '\'' + br.split(':')[1] + '\',' else: bibref700string += '\'' + br.split(':')[1] + '\',' if bibref100string[len(bibref100string) - 1] == ',': bibref100string = bibref100string[0:len(bibref100string) - 1] + ')' else: bibref100string = '' if bibref700string[len(bibref700string) - 1] == ',': bibref700string = bibref700string[0:len(bibref700string) - 1] + ')' else: bibref700string = '' if bibref100string: bibrec100list = run_sql("SELECT `id_bibxxx` FROM `bibrec_bib10x` WHERE `id_bibrec`=%s" " and `id_bibxxx` in " + bibref100string, (str(bibrec['bibrec_id']),)) else: bibrec100list = [] if bibref700string: bibrec700list = run_sql("SELECT `id_bibxxx` FROM `bibrec_bib70x` WHERE `id_bibrec`=%s" " and `id_bibxxx` in" + bibref700string, (str(bibrec['bibrec_id']),)) else: bibrec700list = [] for br in bibreflist: if (long(br.split(':')[1]),) in bibrec100list: if br not in self.bibreclist: self.bibreclist.append([br + ',' + bibrec['bibrec_id'] , str(self.va[1])]) break elif (long(br.split(':')[1]),) in bibrec700list: if br not in self.bibreclist: self.bibreclist.append([br + ',' + bibrec['bibrec_id'] , str(self.va[1])]) break close_connection() tbibreclist = [] if bconfig.TABLES_UTILS_DEBUG: print 'update_personID_from_algorithm: get_bibreclist threads: ' for va in VAlist: tempbibreclist = [] bibrec = run_sql("select value from aidVIRTUALAUTHORSDATA where virtualauthorID=%s and tag=%s ", (str(va[0]), 'bibrefrecpair')) if len(bibrec) > 0: tempbibreclist.append([str(bibrec[0][0]) , str(va[1])]) for b in tempbibreclist: if b not in bibreclist: bibreclist.append(b) else: current = get_va_bibreclist(va) tbibreclist.append(current) if bconfig.TABLES_UTILS_DEBUG: sys.stdout.write('.') sys.stdout.flush() current.start() while threading.activeCount() > bconfig.PERSONID_SQL_MAX_THREADS: time.sleep(0.02) for t in tbibreclist: t.join() for b in t.bibreclist: if b not in bibreclist: bibreclist.append(b) if bconfig.TABLES_UTILS_DEBUG: print '\nupdate_personID_from_algorithm: get_bibreclist ---------------- Considering RA: ' + str(currentRA) return bibreclist def create_new_person_from_bibreclist(bibreclist): #creating new personid PID = max(run_sql('SELECT MAX(PersonID) FROM `aidPERSONID`')[0][0], -1) + 1 SQLquery = '' for br in bibreclist: flag = 0 if br[1] >= bconfig.PERSONID_CNP_FLAG_1: flag = 1 elif br[1] < bconfig.PERSONID_CNP_FLAG_MINUS1: flag = -1 SQLquery += ('insert into `aidPERSONID` (PersonID, tag, data, flag) values (' + str(PID) + ', \'paper\',%s,\'' + str(flag) + '\');') % ('\'' + br[0] + '\'') if SQLquery: run_sql(SQLquery) update_personID_names_string_set(((str(PID),),)) if bconfig.TABLES_UTILS_DEBUG: print 'update_personID_from_algorithm: create_new_person ---------------- ' + str(PID) def get_person_ra(person_papers): inverse_ra_list = [] papers_vas = [] class get_va_from_paper(Thread): def __init__ (self, paper): Thread.__init__(self) self.paper = paper self.vas = [] self.authnameid = None def run(self): vaids = run_sql("select virtualauthorID from aidVIRTUALAUTHORSDATA where tag='bibrefrecpair' and value=%s", (paper[0],)) if vaids: for vaid in vaids: self.vas.append(vaid[0]) close_connection() return else: self.authnameid = run_sql("select Name_id from aidAUTHORNAMESBIBREFS where bibref=%s", 
(str(self.paper[0].split(',')[0]),)) if len(self.authnameid) > 0: self.va = run_sql( "select a.virtualauthorID from aidVIRTUALAUTHORSDATA as a inner join " "aidVIRTUALAUTHORSDATA as b on a.virtualauthorID=b.virtualauthorID " "where ((a.tag=%s and a.value=%s) and (b.tag=%s and b.value=%s))", ('bibrec_id', str(self.paper[0].split(',')[1]), 'orig_authorname_id', str(self.authnameid[0][0]))) #This is left here for benchmarking, it is still not clear which approach is the fastest #self.va = run_sql('select virtualauthorID from `aidVIRTUALAUTHORSDATA` where ( virtualauthorID in (' # + ('select virtualauthorID from `aidVIRTUALAUTHORSDATA` where tag=\'bibrec_id\' and value=\'%s\'') # % (str(self.paper[0].split(',')[1])) # + ')) and ((tag, value) = (\'orig_authorname_id\', \'' # + str(authnameid[0][0]) + '\'))') for i in self.va: self.vas.append(i[0]) close_connection() tvapaper = [] if bconfig.TABLES_UTILS_DEBUG: print 'update_personID_from_algorithm: get_va_from_paper threads: ' for paper in person_papers: current = get_va_from_paper(paper) tvapaper.append(current) if bconfig.TABLES_UTILS_DEBUG: sys.stdout.write('.') sys.stdout.flush() current.start() while threading.activeCount() > bconfig.PERSONID_SQL_MAX_THREADS: time.sleep(0.02) for t in tvapaper: t.join() for b in t.vas: if b not in papers_vas: papers_vas.append(b) papers_vas_string = '( ' for i in papers_vas: papers_vas_string += '\'' + str(i) + '\',' papers_vas_string = papers_vas_string[0:len(papers_vas_string) - 1] + ' )' if len(papers_vas) >= 1: r = run_sql("select distinct `realauthorID` " " from `aidREALAUTHORS` where virtualauthorID in " + papers_vas_string) if len(r) > 0: inverse_ra_list.append(r) else: inverse_ra_list = [] if bconfig.TABLES_UTILS_DEBUG: print '\nupdate_personID_from_algorithm: get_person_ra ---------------- on ' + str(person_papers) return inverse_ra_list def merge_update_person_with_ra(pids, person_paper_list, currentRA, bibreclist): ras = get_person_ra(person_paper_list) # bibrecslists = [] bibrecset = set() person_rejected_papers = run_sql("select data from `aidPERSONID` where " ' tag=%s and flag=%s and PersonID=%s', ('paper', '-2', str(pids[0]))) person_confirmed_papers = run_sql("select data from `aidPERSONID` where " ' tag=%s and flag=%s and PersonID=%s', ('paper', '2', str(pids[0]))) person_rejected_papers_set = set() for paper in person_rejected_papers: person_rejected_papers_set.add(paper[0]) person_confirmed_papers_set = set() for paper in person_confirmed_papers: person_confirmed_papers_set.add(paper[0]) for ra in ras: blist = get_bibreclist(ra[0]) # bibrecslists.append(blist) for doc in blist: if doc[1] >= bconfig.PERSONID_MIN_P_FROM_BCTKD_RA: bibrecset.add(doc[0]) for doc in bibreclist: if doc[1] >= bconfig.PERSONID_MIN_P_FROM_NEW_RA: bibrecset.add(doc[0]) person_paper_set = set() for paper in person_paper_list: person_paper_set.add(paper[0]) p_to_add = bibrecset.difference(person_paper_set) p_to_add = p_to_add.difference(person_rejected_papers_set) p_to_add = p_to_add.difference(person_confirmed_papers_set) # we might consider the case in which the algorithm is clustering two papers which are # manually assigned to different persons. 
That would mean adding a potentially really slow query # and once tthe algorithm will be able to take into consideration user input logs that should never happen # so this will be left to be done once we will see if it is really necessary to slow down everything # when the algorithm is clustering nicely this shouldn't happen anyway p_to_remove = person_paper_set.difference(bibrecset) p_to_remove = p_to_remove.difference(person_confirmed_papers_set) p_to_remove = p_to_remove.difference(person_rejected_papers_set) SQLquery = '' for br in p_to_add: SQLquery += ('insert into `aidPERSONID` (PersonID, tag, data, flag) values (' + str(pids[0]) + ', \'paper\',%s,\'0\');') % ('\'' + br + '\'') if SQLquery: run_sql(SQLquery) SQLquery = '' for br in p_to_remove: SQLquery += ('delete from `aidPERSONID` where PersonID=\'' + str(pids[0]) + '\' and tag=\'paper\' and data=\'' + str(br) + '\';') if SQLquery: run_sql(SQLquery) update_personID_names_string_set((pids,)) if bconfig.TABLES_UTILS_DEBUG: print 'update_personID_from_algorithm: Merging ----------------' + str(pids) + ' with realauthor ' + str(currentRA) + ' and found ras ' + str(ras) # print 'adding ' + str(p_to_add) # print 'removing ' + str(p_to_remove) if not RAlist or len(RAlist) == 0: RAlist = run_sql('SELECT DISTINCT `realauthorID` FROM `aidREALAUTHORS`')# LIMIT 1 , 15') for currentRA in RAlist: if bconfig.TABLES_UTILS_DEBUG: print '---|||||--- considering RA ' + str(currentRA) #bibreclist is the list of bibrefs associated with a RA bibreclist = get_bibreclist(currentRA) if not bibreclist: if bconfig.TABLES_UTILS_DEBUG: print "update_personID_from_algorithm: Skipping RA. Got no data from VA." continue bibrecsqlstring = '( ' for i in bibreclist: bibrecsqlstring += '\'' + str(i[0]) + '\',' bibrecsqlstring = bibrecsqlstring[0:(len(bibrecsqlstring) - 1)] + ' )' sqlstr = "SELECT DISTINCT PersonID FROM `aidPERSONID` WHERE tag=%s and `flag` >= %s and `data` in " + bibrecsqlstring if len(bibreclist) >= 1: pids = run_sql(sqlstr, ('paper', '0')) else: pids = [] if bconfig.TABLES_UTILS_DEBUG: print 'update_personID_from_algorithm: Possible PIDS: ' + str(pids) if len(pids) < 1: create_new_person_from_bibreclist(bibreclist) else: #collect all the bibrefs #find all RA involved #decide which ones are really connected (>threshold) #merge them in the person found person_paper_list = [] for pid in pids: person_papers = [] try: person_papers = run_sql("select data from `aidPERSONID` use index (`ptf-b`) where tag=%s and " "flag >= %s and PersonID=%s", ('paper', str(bconfig.PERSONID_UPFA_PPLMF), str(pid[0]))) except (OperationalError, ProgrammingError): person_papers = run_sql("select data from `aidPERSONID` where tag=%s and " "flag >= %s and PersonID=%s", ('paper', str(bconfig.PERSONID_UPFA_PPLMF), str(pid[0]))) person_paper_list.append(person_papers) docn = len(bibreclist) bibrectdict = dict(bibreclist) compatibility_list = [] compatible_papers_count = [] for pid in person_paper_list: summa = 0.0 p_c = 0.0 for doc in pid: try: summa += float(bibrectdict[doc[0]]) p_c += 1 except: pass #print 'noindex exception!' compatibility_list.append(summa / docn) compatible_papers_count.append(p_c / docn) if bconfig.TABLES_UTILS_DEBUG: print 'update_personID_from_algorithm: Compatibility list: ' + str(compatibility_list) if max(compatibility_list) < bconfig.PERSONID_MAX_COMP_LIST_MIN_TRSH: if bconfig.TABLES_UTILS_DEBUG: print 'update_personID_from_algorithm: Max compatibility list < than 0.5!!!' 
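            #Worked example of the figures above (numbers invented): with
            #docn = 4 papers coming from the RA and a candidate person matching
            #2 of them with probabilities 0.9 and 0.8, then
            #compatibility = (0.9 + 0.8) / 4 = 0.425 and
            #compatible_papers_count = 2 / 4 = 0.5; in this branch the person
            #is merged only if the latter reaches
            #PERSONID_MAX_COMP_LIST_MIN_TRSH_P_N, otherwise a new person is
            #created from the bibreclist.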
pidindex = compatible_papers_count.index(max(compatible_papers_count)) if compatible_papers_count[pidindex] >= bconfig.PERSONID_MAX_COMP_LIST_MIN_TRSH_P_N: merge_update_person_with_ra(pids[pidindex], person_paper_list[pidindex], currentRA, bibreclist) else: create_new_person_from_bibreclist(bibreclist) else: maxcount = compatibility_list.count(max(compatibility_list)) if maxcount == 1: #merge pidindex = compatibility_list.index(max(compatibility_list)) merge_update_person_with_ra(pids[pidindex], person_paper_list[pidindex], currentRA, bibreclist) elif maxcount > 1: if bconfig.TABLES_UTILS_DEBUG: print 'update_personID_from_algorithm: !!!!!!!!!!!!! Passing by, no maximum in compatibility list??' #resolve merge else: if bconfig.TABLES_UTILS_DEBUG: print 'update_personID_from_algorithm: !!!!!!!!!!!!! Error: no one is compatible!!? not doing anything...' update_personID_canonical_names() def export_personid_to_spiresid_validation(filename='/tmp/inspirepid', filename_oids='/tmp/inspirepidoids'): ''' WARNING: still to be consolidated, but output is usable WARNING^2: S L O W . @fixme: export_personid_to_spiresid_validation: use get_record, might be much faster ''' fp = open(filename, 'w') fp2 = open(filename_oids, 'w') fp.write('Personid->inspireid match:\n\n') fp2.write('Personid->inspireid match: INSPERE IDS only \n\n') pids = run_sql('SELECT personid FROM `aidPERSONID` WHERE 1 group by personid') for pid in pids: print 'considering:' + str(pid) fp.write('Considering pid' + str(pid) + '\n') fp2.write('Considering pid' + str(pid) + '\n') papers = run_sql('select data from aidPERSONID where tag=\'paper\' and ' 'personid=\'' + str(pid[0]) + '\' ') parray = [] for paper in papers: if paper[0].split(':')[0] == '700': print ' -' + str(paper) fields = run_sql('select id,value from bib70x where ' '(tag=\'700__a\') and ' 'id=\'' + str(paper[0].split(',')[0].split(':')[1]) + '\'') insid = run_sql('select id,value from bib70x where tag=\'700__i\' ' 'and (id) in ' '(select a.id_bibxxx from bibrec_bib70x as a inner join ' 'bibrec_bib70x as b using(id_bibrec)' 'where a.field_number = b.field_number and ' 'b.id_bibxxx = \'' + str(paper[0].split(',')[0].split(':')[1]) + '\' and b.id_bibrec = \'' + str(paper[0].split(',')[1]) + '\')') parray.append([fields, insid, paper]) for p in parray: fp.write(' - ' + str(p[0]) + ' ' + str(p[1]) + ' from ' + str(p[2]) + '\n') if len(p[1]) >= 1: fp2.write(' - ' + str(p[0]) + ' ' + str(p[1]) + ' from ' + str(p[2]) + '\n') fp.close() fp2.close() def export_spiresid_to_personid_validation(filename='/tmp/inspireid'): '''Build human readable validation for the SPIRES export User log case usages and contents reference. 
    Table structure:
      id  trans_id  timestamp  userinfo  personID  action  tag     value    comment
      int int       time       char255   int       char50  char50  char200  text

    Operations on papers:
    * Assignment: - assign bibrec,bibref to personid
      id  trans_id  timestamp   userinfo         personID  action  tag    value          comment
      xxx xxxxx     xxxx-xx-xx  uid_inspire_xxx  xxxx      assign  paper  x00:xxxx,xxxx  NULL
      xxx xxxxx     xxxx-xx-xx  uid_inspire_xxx  xxxx      assign  paper  x00:xxxx,xxxx  'telephone request of the author bla bla bla'
      xxx xxxxx     xxxx-xx-xx  uid_inspire_xxx  xxxx      assign  paper  x00:xxxx,xxxx  'first manual assignment, moved from pid: xxxx'

    * Rejection: - reject bibrec,bibref from personid
      id  trans_id  timestamp   userinfo         personID  action  tag    value          comment
      xxx xxxxx     xxxx-xx-xx  uid_inspire_xxx  xxxx      reject  paper  x00:xxxx,xxxx  NULL
      xxx xxxxx     xxxx-xx-xx  uid_inspire_xxx  xxxx      reject  paper  x00:xxxx,xxxx  'telephone request of the author bla bla bla'
      xxx xxxxx     xxxx-xx-xx  uid_inspire_xxx  xxxx      reject  paper  x00:xxxx,xxxx  'manual inspection of the paper'

    * Reset: - Reset bibrec,bibref status (don't know who really is the author)
      id  trans_id  timestamp   userinfo         personID  action  tag    value          comment
      xxx xxxxx     xxxx-xx-xx  uid_inspire_xxx  xxxx      reset   paper  x00:xxxx,xxxx  NULL
      xxx xxxxx     xxxx-xx-xx  uid_inspire_xxx  xxxx      reset   paper  x00:xxxx,xxxx  'discovered error'
      xxx xxxxx     xxxx-xx-xx  uid_inspire_xxx  xxxx      reset   paper  x00:xxxx,xxxx  'not enough information on the paper'

    Action,tag allowed couples:
    * assign,paper
    * reject,paper
    * reset,paper

    Operations on person ids:
    * Add: - assign info to personid
      id  trans_id  timestamp   userinfo         personID  action    tag          value            comment
      xxx xxxxx     xxxx-xx-xx  uid_inspire_xxx  xxxx      data_add  inspire_uid  uid_inspire_xxx  NULL
      xxx xxxxx     xxxx-xx-xx  uid_inspire_xxx  xxxx      data_add  email_addr   xxx@xxx.xxx      NULL
      xxx xxxxx     xxxx-xx-xx  uid_inspire_xxx  xxxx      data_mod  email_addr   zzz@xxx.xxx      NULL

    Action,tag allowed couples:
    * data_add,inspire_uid
    * data_add,email_addr
    * data_add,full_name
    * data_add,address
    * data_add,telephone_[home|office|...]
    ** data_mod, data_del: same as data_add

    NOTE: new action/tag couples can be added as needed.
    NOTE: in case of need, comment can be used instead of value (which is
    limited to 255 chars), but it is important to be consistent: if a field
    is using comment instead of value, that _must_ be done _always_.
Automated operations: * Table updates: - Update_authornames_table_from_paper id trans_id timestamp userinfo personID action tag value comment xxx xxxxx xxxx-xx-xx daemon -1 UATFP bibsched status NULL Actions: * update_auntornames_table_from_paper: UATFP * authornames_tables_gc: ATGC * update_personid_table_from_paper: UPITFP ''' fp = open(filename, 'w') bibrefs = run_sql('SELECT id,tag,value,count(value) FROM `bib70x` WHERE ' '`tag` LIKE \'700__i\' group by value order by value') fp.write('Inspireid->personid match:\n\n') for i in bibrefs: print 'considering:' + str(i) # bibref = run_sql('select id,value from bib70x where tag=\'700__a\' ' # 'and (id) in (select id_bibxxx from bibrec_bib70x where ' # '(id_bibrec,field_number) in ' # '(select id_bibrec,field_number from bibrec_bib70x ' # 'where id_bibxxx = \''+str(i[0])+'\'))') bibref = run_sql('select id,value from bib70x where tag=\'700__a\' ' 'and (id) in ' '(select a.id_bibxxx from bibrec_bib70x as a inner join ' 'bibrec_bib70x as b using(id_bibrec)' 'where a.field_number = b.field_number and ' 'b.id_bibxxx = \'' + str(i[0]) + '\')') print ' found ' + str(bibref) for bib in bibref: fp.write(' -\n') pids = run_sql('select personid from aidPERSONID where tag=\'paper\'' ' and data like \'700:%,' + str(bib[0]) + '\'') fp.write(str(i) + ':\n') for pid in pids: names = run_sql('select data,flag from aidPERSONID where' ' tag=\'gathered_name\'' ' and personID=\'' + str(pid[0]) + '\'') fp.write(' -' + str(pid) + ': ' + str(names) + '\n ') fp.write('\n') fp.close() def get_user_log(transactionid='', userinfo='', personID='', action='', tag='', value='', comment='', only_most_recent=False): ''' Get user log table entry matching all the given parameters; all of them are optional. IF no parameters are given retuns the complete log table @param transactionid: id of the transaction @param userinfo: user name or identifier @param personid: id of the person involved @param action: action @param tag: tag @param value: value @param comment: comment ''' sql_query = 'select id,transactionid,timestamp,userinfo,personid,action,tag,value,comment from aidUSERINPUTLOG where 1 ' if transactionid: sql_query += ' and transactionid=\'' + str(transactionid) + '\'' if userinfo: sql_query += ' and userinfo=\'' + str(userinfo) + '\'' if personID: sql_query += ' and personid=\'' + str(personID) + '\'' if action: sql_query += ' and action=\'' + str(action) + '\'' if tag: sql_query += ' and tag=\'' + str(tag) + '\'' if value: sql_query += ' and value=\'' + str(value) + '\'' if comment: sql_query += ' and comment=\'' + str(comment) + '\'' if only_most_recent: sql_query += ' order by timestamp desc limit 0,1' return run_sql(sql_query) def insert_user_log(userinfo, personid, action, tag, value, comment='', transactionid=0, timestamp=''): ''' Instert log entries in the user log table. For example of entres look at the table generation script. 
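    A minimal illustrative call (all values invented):
    insert_user_log('uid_inspire_42', 123, 'assign', 'paper',
                    '700:1234,5678', comment='telephone request of the author')
    appends one row with the current timestamp and returns the given
    transactionid.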
@param userinfo: username or user identifier @type: string @param personid: personid involved in the transaction @type: longint @param action: action type @type: string @param tag: tag @type: string @param value: value for the transaction @type: string @param comment: optional comment for the transaction @type: string @param transactionid: optional id for the transaction @type: longint @return: the transactionid @rtype: longint ''' # if transactionid == 0: # transactionid = max(run_sql('SELECT MAX(transactionid) FROM `aidUSERINPUTLOG`')[0][0], -1) + 1 if timestamp: tsui = str(timestamp) else: tsui = run_sql('select now()')[0][0] # run_sql('insert into aidUSERINPUTLOG (transactionid,timestamp,userinfo,personid,action,tag,value,comment) values ' # '(%(transactionid)s,%(timestamp)s,%(userinfo)s,%(personid)s,%(action)s,%(tag)s,%(value)s,%(comment)s)', # ({'transactionid':str(transactionid), # 'timestamp':str(tsui), # 'userinfo':str(userinfo), # 'personid':str(personid), # 'action':str(action), # 'tag':str(tag), # 'value':str(value), # 'comment':str(comment)})) run_sql('insert into aidUSERINPUTLOG (transactionid,timestamp,userinfo,personid,action,tag,value,comment) values ' '(%s,%s,%s,%s,%s,%s,%s,%s)', (str(transactionid), str(tsui), str(userinfo), str(personid), str(action), str(tag), str(value), str(comment))) return transactionid def export_personID_to_human_readable_file(filename='/tmp/hrexport.txt', Pids=None): ''' @deprecated: support for legacy software Export the personID of each document to a human readable file, for brief inspection purposes. @param filename: filename to output to @type: string @param Pids: list of persons ids to limit the export @type: (('2',),) ''' try: from invenio.search_engine import get_record except ImportError: print "not able to import get_record!" if not Pids or len(Pids) == 0: Pids = run_sql('SELECT DISTINCT `PersonID` FROM `aidPERSONID`')# LIMIT 1,20') destfile = open(filename, 'w') for pid in Pids: if bconfig.TABLES_UTILS_DEBUG: print 'Exporting ' + str(pid) + '...' infos = run_sql('SELECT tag,data FROM `aidPERSONID` where PersonID=\'' + str(pid[0]) + '\' and not tag=\'paper\'') docs = run_sql('SELECT `data` FROM `aidPERSONID` where PersonID=\'' + str(pid[0]) + '\' and tag=\'paper\' and flag>=\'-1\'') destfile.write('Person ID: ' + str(pid[0]) + '\n') for info in infos: destfile.write(' info [' + str(info[0]) + ']: ' + str(info[1]) + '\n') for doc in docs: #title = run_sql('SELECT `value` FROM `bib24x` WHERE `id` in \ # ((select id_bibxxx from bibrec_bib24x where id_bibrec=\'' + str(doc[0].split(',')[1]) + '\')) and tag=\'245__a\'') #id = run_sql('SELECT `id_bibxxx` FROM `bibrec_bib' + ('10' if str(doc[0].split(',')[0].split(':')[0]) == '100' else '70') # + 'x` WHERE and `id`=\'' + str(doc[0].split(',')[0].split(':')[1]) + '\'') title = "No title on paper..." 
try: title = get_record(int(doc[0].split(',')[1]))['245'][0][0][0][1] except (IndexError, KeyError, ValueError): title = "Problem encountered while retrieving document title" dsplit = doc[0].split(',') tnum = "70" if str(dsplit[0].split(':')[0]) == "100": tnum = "10" authorname = run_sql("SELECT value FROM bib%sx " "WHERE id = %s" % (tnum, dsplit[0].split(':')[1])) destfile.write(' name: ' + str(authorname) + ' paper: [' + str(doc[0]) + ']: ' + str(title) + '\n') destfile.write('------------------------------------------------------------------------------\n') destfile.close() def export_personID_to_spires(filename='/tmp/spiresexport.txt', Pids=None): ''' @deprecated: support for legacy software Export the personID of each document to SPIRES syntax. @param filename: filename to output to @type: string @param Pids: list of persons ids to limit the export @type: (('2',),) ''' if not Pids or len(Pids) == 0: Pids = run_sql('SELECT DISTINCT `PersonID` FROM `aidPERSONID`')# LIMIT 0,20') destfile = open(filename, 'w') for pid in Pids: if bconfig.TABLES_UTILS_DEBUG: print 'Exporting ' + str(pid) + '...' docs = run_sql('SELECT `data` FROM `aidPERSONID` where PersonID=\'' + str(pid[0]) + '\' and tag=\'paper\' and flag>=\'-1\'') for doc in docs: f970a = docs = run_sql('SELECT `value` FROM `bib97x` where id=\'' + str(doc[0].split(',')[1]) + '\' and tag=\'970__a\'') dsplit = doc[0].split(',') tnum = "70" if str(dsplit[0].split(':')[0]) == "100": tnum = "10" author_number = run_sql("SELECT field_number FROM bibrec_bib%sx " "WHERE id_bibrec = %s " "AND id_bibxxx = %s" % (tnum, dsplit[1], dsplit[0].split(':')[1])) author_offset = run_sql("SELECT min(field_number) FROM bibrec_bib%sx " "WHERE id_bibrec = %s" % (tnum, dsplit[1])) # print f970a, author_number, doc # if len(author_number) >= 1: # destfile.write('merge = ' + str(f970a[0][0].split('-')[1]) + ';\nastr(' # + str(author_number[0][0]) + ');\nauthor-note(100)=INSPIRE-AUTO-' # + str(pid[0]) + ';\n;\n') if str(doc[0].split(',')[0].split(':')[0]) == '100': author_exp = 1 else: if len(author_number) >= 1: author_exp = author_number[0][0] - author_offset[0][0] + 2 else: if bconfig.TABLES_UTILS_DEBUG: print "No authornumber, setting -1!!!" author_exp = -1 if bconfig.TABLES_UTILS_DEBUG: print f970a, author_number, author_offset, author_exp, doc destfile.write('merge = ' + str(f970a[0][0].split('-')[1]) + ';\nastr(' + str(author_exp) + ');\nauthor-note(100)=INSPIRE-AUTO-' + str(pid[0]) + ';\n;\n') destfile.close() # IRN = ; # ASTR; # A= ; # AFF = ; # DESY-AUTHOR = INSPIRE-BIBAUTHOR-; def user_can_modify_data(uid, pid): ''' Return True if the uid can modify data of this personID, false otherwise. @param uid: the user id @type: int @param pid: the person id @type: int @return: can user mofidfy data? @rtype: boolean ''' pid_uid = [] try: pid_uid = run_sql("select data from aidPERSONID use index (`ptf-b`) where tag = %s and personid = %s", ('uid', str(pid))) except (OperationalError, ProgrammingError): pid_uid = run_sql("select data from aidPERSONID where tag = %s and personid = %s", ('uid', str(pid))) if len(pid_uid) >= 1: if str(uid) == str(pid_uid[0][0]): if acc_authorize_action(uid, bconfig.CLAIMPAPER_CHANGE_OWN_DATA)[0] == 0: return True if acc_authorize_action(uid, bconfig.CLAIMPAPER_CHANGE_OTHERS_DATA)[0] == 0: return True return False else: if acc_authorize_action(uid, bconfig.CLAIMPAPER_CHANGE_OTHERS_DATA)[0] == 0: return True return False def user_can_modify_paper(uid, paper): ''' Return True if the uid can modify this paper, false otherwise. 
If the paper is assigned more then one time (from algorithms) consider the most privileged assignment. @param uid: the user id @type: int @param paper: the paper bibref,bibrec pair x00:1234,4321 @type: str @return: can user mofidfy paper attribution? @rtype: boolean ''' prow = [] try: prow = run_sql("select id,personid,tag,data,flag,lcul from aidPERSONID use index (`tdf-b`) where tag=%s and data =%s" "order by lcul desc limit 0,1", ('paper', str(paper))) except (OperationalError, ProgrammingError): prow = run_sql("select id,personid,tag,data,flag,lcul from aidPERSONID where tag=%s and data =%s" "order by lcul desc limit 0,1", ('paper', str(paper))) if len(prow) == 0: if ((acc_authorize_action(uid, bconfig.CLAIMPAPER_CLAIM_OWN_PAPERS)[0] == 0) or (acc_authorize_action(uid, bconfig.CLAIMPAPER_CLAIM_OTHERS_PAPERS)[0] == 0)): return True return False min_req_acc_n = int(prow[0][5]) req_acc = resolve_paper_access_right(bconfig.CLAIMPAPER_CLAIM_OWN_PAPERS) pid_uid = run_sql("select data from aidPERSONID where tag = %s and personid = %s", ('uid', str(prow[0][1]))) if len(pid_uid) > 0: if (str(pid_uid[0][0]) != str(uid)) and min_req_acc_n > 0: req_acc = resolve_paper_access_right(bconfig.CLAIMPAPER_CLAIM_OTHERS_PAPERS) if min_req_acc_n < req_acc: min_req_acc_n = req_acc min_req_acc = resolve_paper_access_right(min_req_acc_n) if (acc_authorize_action(uid, min_req_acc)[0] == 0) and (resolve_paper_access_right(min_req_acc) >= min_req_acc_n): return True else: return False def resolve_paper_access_right(acc): ''' Given a string or an integer, resolves to the corresponding integer or string If asked for a wrong/not present parameter falls back to the minimum privilege. ''' access_dict = {bconfig.CLAIMPAPER_VIEW_PID_UNIVERSE: 0, bconfig.CLAIMPAPER_CLAIM_OWN_PAPERS: 25, bconfig.CLAIMPAPER_CLAIM_OTHERS_PAPERS: 50} if isinstance(acc, str): try: return access_dict[acc] except: return 0 inverse_dict = dict([[v, k] for k, v in access_dict.items()]) lower_accs = [a for a in inverse_dict.keys() if a <= acc] try: return inverse_dict[max(lower_accs)] except: return bconfig.CLAIMPAPER_VIEW_PID_UNIVERSE def resolve_data_access_right(acc): ''' Given a string or an integer, resolves to the corresponding integer or string If asked for a wrong/not present parameter falls back to the minimum privilege. ''' access_dict = {bconfig.CLAIMPAPER_VIEW_PID_UNIVERSE: 0, bconfig.CLAIMPAPER_CHANGE_OWN_DATA: 25, bconfig.CLAIMPAPER_CHANGE_OTHERS_DATA: 50} if isinstance(acc, str): try: return access_dict[acc] except: return 0 inverse_dict = dict([[v, k] for k, v in access_dict.items()]) lower_accs = [a for a in inverse_dict.keys() if a <= acc] try: return inverse_dict[max(lower_accs)] except: return bconfig.CLAIMPAPER_VIEW_PID_UNIVERSE def person_bibref_is_touched(pid, bibref): ''' Determines if a record attached to a person has been touched by a human by checking the flag. @param pid: The Person ID of the person to check the assignment from @type pid: int @param bibref: The paper identifier to be checked (e.g. 
"100:12,144") @type bibref: string ''' if not isinstance(pid, int): try: pid = int(pid) except (ValueError, TypeError): raise ValueError("Person ID has to be a number!") if not bibref: raise ValueError("A bibref is expected!") flag = [] try: flag = run_sql("SELECT flag FROM aidPERSONID use index (`ptf-b`) WHERE " "personid = %s AND tag = 'paper' AND data = %s" , (pid, bibref)) except (OperationalError, ProgrammingError): flag = run_sql("SELECT flag FROM aidPERSONID WHERE " "personid = %s AND tag = 'paper' AND data = %s" , (pid, bibref)) try: flag = flag[0][0] except (IndexError): return False if not flag: return False elif -2 < flag < 2: return False else: return True def get_bibref_modification_status(bibref): ''' Determines if a record attached to a person has been touched by a human by checking the flag. @param pid: The Person ID of the person to check the assignment from @type pid: int @param bibref: The paper identifier to be checked (e.g. "100:12,144") @type bibref: string returns [bool:human_modified, int:lcul] ''' if not bibref: raise ValueError("A bibref is expected!") flags = [] try: flags = run_sql("SELECT flag,lcul FROM aidPERSONID use index (`tdf-b`) WHERE " "tag = 'paper' AND data = %s" , (bibref,)) except (OperationalError, ProgrammingError): flags = run_sql("SELECT flag,lcul FROM aidPERSONID WHERE " "tag = 'paper' AND data = %s" , (bibref,)) try: flag = flags[0][0] lcul = flags[0][1] except (IndexError): return [False, 0] return [flag, lcul] def assign_person_to_uid(uid, pid): ''' Assigns a person to a userid. If person already assigned to someone else, create new person. Returns the peron id assigned. @param uid: user id, int @param pid: person id, int, if -1 creates new person. @return: pid int ''' def create_new_person_from_uid(uid): #creates a new person pid = run_sql("select max(personid) from aidPERSONID")[0][0] if pid: try: pid = int(pid) except (ValueError, TypeError): pid = -1 pid += 1 set_person_data(pid, 'uid', str(uid)) return pid if pid == -1: pid = create_new_person_from_uid(uid) return pid else: current_uid = get_person_data(pid, 'uid') if len(current_uid) == 0: set_person_data(pid, 'uid', str(uid)) return pid else: pid = create_new_person_from_uid(uid) return pid def assign_uid_to_person(uid, pid, create_new_pid=False, force=False): ''' Assigns a userid to a person, counterchecknig with get_personid_from_uid. If uid has already other person returns other person. If create_new_pid and the pid is -1 creates a new person. If force, deletes any reference to that uid from the tables and assigns to pid, if pid wrong (less then zero) returns -1. 
@param uid: user id, int @param pid: person id, int @param create_new_pid: bool @param force, bool ''' def create_new_person_from_uid(uid): #creates a new person pid = run_sql("select max(personid) from aidPERSONID")[0][0] if pid: try: pid = int(pid) except (ValueError, TypeError): pid = -1 pid += 1 set_person_data(pid, 'uid', str(uid)) return pid if force and pid >= 0: run_sql("delete from aidPERSONID where tag=%s and data=%s", ('uid', uid)) set_person_data(pid, 'uid', str(uid)) return pid elif force and pid < 0: return -1 current = get_personid_from_uid(((uid,),)) if current[1]: return current[0][0] else: if pid >= 0: cuid = get_person_data(pid, 'uid') if len(cuid) > 0: if str(cuid[0][1]) == str(uid): return pid else: if create_new_pid: create_new_person_from_uid(uid) else: return -1 else: set_person_data(pid, 'uid', str(uid)) return pid else: if create_new_pid: create_new_person_from_uid(uid) else: return -1 def get_personid_from_uid(uid): ''' Returns the personID associated with the provided ui. If the personID is already associated with the person the secon parameter is True, false otherwise. If there is more then one compatible results the persons are listed in order of name compatibility. If no persons are found returns ([-1],False) !!! The guessing mechanism got outdated thus disabled and replaced by arxiv_login in webapi, the code is left there for future updates !!! If there is none, associates on a best effort basis the best matching personid to the uid. @param uid: userID @type uid: ((int,),) ''' pid = run_sql("select personid from aidPERSONID where tag=%s and data=%s", ('uid', str(uid[0][0]))) if len(pid) == 1: return (pid[0], True) else: return ([-1], False) def get_personid_from_paper(bibrecref): ''' Returns the personID associated with the provided bibrec/bibref pair @param bibrecref: x00:xxxx,xxxx @type uid: str @return: pid ''' pid = run_sql("select id,personid,tag,data,flag,lcul from aidPERSONID where tag=%s and data=%s", ('paper', bibrecref)) if len(pid) == 1: return pid[0][1] else: return -1 def get_possible_bibrecref(names, bibrec, always_match=False): ''' Returns a list of bibrefs for which the surname is matching @param names: list of names strings @param bibrec: bibrec number @param always_match: match with all the names (full bibrefs list) ''' splitted_names = [] for n in names: splitted_names.append(split_name_parts(n)) bibrec_names_100 = run_sql("select o.id, o.value from bib10x o, " "(select i.id_bibxxx as iid from bibrec_bib10x i " "where id_bibrec=%s) as dummy " "where o.tag='100__a' AND o.id = dummy.iid", (str(bibrec),)) bibrec_names_700 = run_sql("select o.id, o.value from bib70x o, " "(select i.id_bibxxx as iid from bibrec_bib70x i " "where id_bibrec=%s) as dummy " "where o.tag='700__a' AND o.id = dummy.iid", (str(bibrec),)) # bibrec_names_100 = run_sql("select id,value from bib10x where tag='100__a' and id in " # "(select id_bibxxx from bibrec_bib10x where id_bibrec=%s)", # (str(bibrec),)) # bibrec_names_700 = run_sql("select id,value from bib70x where tag='700__a' and id in " # "(select id_bibxxx from bibrec_bib70x where id_bibrec=%s)", # (str(bibrec),)) bibreflist = [] for b in bibrec_names_100: spb = split_name_parts(b[1]) for n in splitted_names: if (n[0].lower() == spb[0].lower()) or always_match: if ['100:' + str(b[0]), b[1]] not in bibreflist: bibreflist.append(['100:' + str(b[0]), b[1]]) for b in bibrec_names_700: spb = split_name_parts(b[1]) for n in splitted_names: if (n[0].lower() == spb[0].lower()) or always_match: if ['700:' + str(b[0]), 
b[1]] not in bibreflist: bibreflist.append(['700:' + str(b[0]), b[1]]) return bibreflist def get_possible_personids_from_paperlist(bibrecreflist): ''' @param bibrecreflist: list of bibrecref couples, (('100:123,123',),) or bibrecs (('123',),) returns a list of pids and connected bibrefs in order of number of bibrefs per pid [ [['1'],['123:123.123','123:123.123']] , [['2'],['123:123.123']] ] ''' pid_bibrecref_dict = {} for b in bibrecreflist: pids = [] try: pids = run_sql("select personid from aidPERSONID use index (`tdf-b`) where tag=%s and data=%s", ('paper', str(b[0]))) except (OperationalError, ProgrammingError): pids = run_sql("select personid from aidPERSONID where tag=%s and data=%s", ('paper', str(b[0]))) for pid in pids: if pid[0] in pid_bibrecref_dict: pid_bibrecref_dict[pid[0]].append(str(b[0])) else: pid_bibrecref_dict[pid[0]] = [str(b[0])] pid_list = [[i, pid_bibrecref_dict[i]] for i in pid_bibrecref_dict] return sorted(pid_list, key=lambda k: len(k[1]), reverse=True) def get_processed_external_recids(pid): ''' Returns processed external recids @param pid: pid @return: [str] ''' db_data = get_person_data(pid, "processed_external_recids") recid_list_str = '' if db_data and db_data[0] and db_data[0][1]: recid_list_str = db_data[0][1] return recid_list_str def set_processed_external_recids(pid, recid_list_str): ''' Set processed external recids @param pid: pid @param recid_list_str: str ''' del_person_data(pid, "processed_external_recids") set_person_data(pid, "processed_external_recids", recid_list_str) def get_persons_from_recids(recids, return_alt_names=False, return_all_person_papers=False): ''' Function to find person informations as occuring on records @param recids: List of rec IDs @type recids: list of int @param return_alt_names: Return all name variations? @type return_alt_names: boolean @param return_all_person_papers: Return also a person's record IDs? 
@type return_all_person_papers: boolean return: tuple of two dicts: structure: ({recid: [personids]}, {personid: {personinfo}}) example: ({1: [4]}, {4: {'canonical_id' : str, 'alternatative_names': list of str, 'person_records': list of int } }) rtype: tuple of two dicts ''' rec_pid = {} pinfo = {} if not isinstance(recids, list) or isinstance(recids, tuple): if isinstance(recids, int): recids = [recids] else: return (rec_pid, pinfo) if not DATA_CACHERS: DATA_CACHERS.append(PersonIDStatusDataCacher()) pid_table_cacher = DATA_CACHERS[0] pid_table_cacher.recreate_cache_if_needed() if not pid_table_cacher.cache: return (rec_pid, pinfo) for recid in recids: rec_names = get_field_values_on_condition(recid, source='API', get_table=['100', '700'], get_tag='a') for rname in rec_names: rname = rname.encode('utf-8') rec_bibrefs = run_sql("select bibrefs from aidAUTHORNAMES where " "db_name=%s", (rname,)) if not rec_bibrefs: continue rec_bibrefs = rec_bibrefs[0][0].split(',') bibrefrec = "" if len(rec_bibrefs) > 1: for ref in rec_bibrefs: table, refid = ref.split(":") tmp = None if table == "100": tmp = run_sql("select id_bibrec from bibrec_bib10x " "where id_bibxxx=%s and id_bibrec=%s", (refid, recid)) elif table == "700": tmp = run_sql("select id_bibrec from bibrec_bib70x " "where id_bibxxx=%s and id_bibrec=%s", (refid, recid)) else: continue if tmp: bibrefrec = "%s,%s" % (ref, recid) break else: try: bibrefrec = "%s,%s" % (rec_bibrefs[0], recid) except IndexError: pass if bibrefrec: pids = [] try: pids = run_sql("select personid from aidPERSONID " "use index (`tdf-b`) where " "tag=%s and data=%s and flag > -1", ('paper', bibrefrec)) except (ProgrammingError, OperationalError): pids = run_sql("select personid from aidPERSONID " "where " "tag=%s and data=%s and flag > -1", ('paper', bibrefrec)) pids = [i[0] for i in pids] for pid in pids: if recid in rec_pid: rec_pid[recid].append(pid) else: rec_pid[recid] = [pid] if pid in pinfo: continue pinfo[pid] = {} cid = "" try: cid = get_person_data(pid, "canonical_name")[0][1] except IndexError: pass pinfo[pid]["canonical_id"] = cid if return_alt_names: anames = get_person_db_names_count((pid,)) pinfo[pid]["alternative_names"] = [anm[0] for anm in anames] if return_all_person_papers: pinfo[pid]["person_records"] = get_person_papers( (pid,), -1, show_author_name=True, show_title=False) return (rec_pid, pinfo) def get_personid_status_cacher(): ''' Returns a DataCacher object describing the status of the pid table content @return: DataCacher Object @rtype: DataCacher ''' if not DATA_CACHERS: DATA_CACHERS.append(PersonIDStatusDataCacher()) return DATA_CACHERS[0] def _pfap_printmsg(identity, msg): if bconfig.TABLES_UTILS_DEBUG: print (time.strftime('%H:%M:%S') + ' personid_fast_assign_papers ' + str(identity) + ': ' + msg) def _pfap_assign_bibrefrec(i, tab, bibref, bibrec, namestring): name_parts = split_name_parts(namestring) pid_names_rows = run_sql("select personid,data from aidPERSONID where tag='gathered_name' and data like %s ", (name_parts[0] + ',%',)) pid_names_dict = {} for pid in pid_names_rows: pid_names_dict[pid[1]] = pid[0] del pid_names_rows names_comparison_list = [] for name in pid_names_dict.keys(): names_comparison_list.append([name, compare_names(name, namestring)]) names_comparison_list = sorted(names_comparison_list, key=lambda x: x[1], reverse=True) _pfap_printmsg('BibrefAss: ' + str(i), ' Top name comparison list against %s: %s' % (namestring, str(names_comparison_list[0:3]))) if len(names_comparison_list) > 0 and 
names_comparison_list[0][1] > 0: _pfap_printmsg('BibrefAss: ' + str(i), ' Assigning to the best fit: %s' % str(pid_names_dict[names_comparison_list[0][0]])) run_sql("insert into aidPERSONID (personid,tag,data,flag,lcul) values (%s,'paper',%s,'0','0')", (str(pid_names_dict[names_comparison_list[0][0]]), tab + ':' + str(bibref) + ',' + str(bibrec))) update_personID_names_string_set([[pid_names_dict[names_comparison_list[0][0]]]]) update_personID_canonical_names([[pid_names_dict[names_comparison_list[0][0]]]]) else: _pfap_printmsg('BibrefAss: ' + str(i), 'Creating a new person...') personid = run_sql("select max(personid)+1 from aidPERSONID")[0][0] run_sql("insert into aidPERSONID (personid,tag,data,flag,lcul) values (%s,'paper',%s,'0','0')", (personid, tab + ':' + str(bibref) + ',' + str(bibrec))) update_personID_names_string_set([[personid]]) update_personID_canonical_names([[personid]]) def _pfap_assign_paper(i, p_q, atul): ''' bibrec = 123 ''' while True: try: bibrec = p_q.get_nowait() except Empty: return _pfap_printmsg('Assigner: ' + str(i), 'Starting on paper: %s' % bibrec) b100 = run_sql("select b.id,b.value from bib10x as b, bibrec_bib10x as a where b.id=a.id_bibxxx and b.tag=%s and a.id_bibrec=%s", ('100__a', bibrec)) b700 = run_sql("select b.id,b.value from bib70x as b, bibrec_bib70x as a where b.id=a.id_bibxxx and b.tag=%s and a.id_bibrec=%s", ('700__a', bibrec)) _pfap_printmsg('Assigner: ' + str(i), 'Found: %s 100: and %s 700:' % (len(b100), len(b700))) for bibref in b100: present = run_sql("select count(data)>0 from aidPERSONID where tag='paper' and data =%s", ('100:' + str(bibref[0]) + ',' + str(bibrec),))[0][0] if not present: _pfap_printmsg('Assigner: ' + str(i), 'Found: 100:%s,%s not assigned, assigning...' % (str(bibref[0]), str(bibrec))) _pfap_assign_bibrefrec(i, '100', bibref[0], bibrec, bibref[1]) for bibref in b700: present = run_sql("select count(data)>0 from aidPERSONID where tag='paper' and data =%s", ('700:' + str(bibref[0]) + ',' + str(bibrec),))[0][0] if not present: _pfap_printmsg('Assigner: ' + str(i), 'Found: 700:%s,%s not assigned, assigning...' % (str(bibref[0]), str(bibrec))) _pfap_assign_bibrefrec(i, '700', bibref[0], bibrec, bibref[1]) atul.acquire() update_authornames_tables_from_paper([[bibrec]]) atul.release() _pfap_printmsg('Assigner: ' + str(i), 'Done with: %s' % bibrec) def personid_fast_assign_papers(paperslist=None): ''' Assign papers to the most compatible person. Compares only the name to find the right person to assign to. If nobody seems compatible, create a new person. ''' _pfap_printmsg('starter', 'Started') if not paperslist: #paperslist = run_sql('select id from bibrec where 1') paperslist = [[x] for x in perform_request_search(p="")] paperslist = [k[0] for k in paperslist] _pfap_printmsg('starter', 'Starting on %s papers ' % len(paperslist)) authornames_table_update_lock = multiprocessing.Lock() papers_q = multiprocessing.Queue() for p in paperslist: papers_q.put(p) process_list = [] for c in range(4 * bconfig.BIBAUTHORID_MAX_PROCESSES): p = multiprocessing.Process(target=_pfap_assign_paper, args=(c, papers_q, authornames_table_update_lock)) process_list.append(p) p.start() for p in process_list: p.join() diff --git a/modules/bibauthorid/lib/bibauthorid_structs.py b/modules/bibauthorid/lib/bibauthorid_structs.py index a5712cecb..35c58455d 100644 --- a/modules/bibauthorid/lib/bibauthorid_structs.py +++ b/modules/bibauthorid/lib/bibauthorid_structs.py @@ -1,332 +1,331 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. 
## Copyright (C) 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - ''' bibauthorid_structs Defines the data structures for computation in memory and acts as bibauthorid's memory storage facility. ''' import Queue # pylint: disable=W0105 AUTHOR_NAMES = [] ''' AUTHOR_NAMES Holds data from the aidAUTHORNAMES table. Structure: [{tag: value}*] Example: [{'id': '1', 'name': 'Groom, Donald E.', 'bibrefs': '100:1,700:9912', 'db_name': 'Groom, Donald E.'}, {'id': '2', 'name': 'de Sacrobosco, Johannes', 'bibrefs': '100:4', 'db_name': 'de Sacrobosco, Johannes'} ] ''' DOC_LIST = [] ''' DOC_LIST Holds data from the aidDOCLIST table. Structure: [{tag: value}*] Example: [{'bibrecid': 680600L, 'authornameids': [305005L, 44341L], 'authornameid_bibrefrec' : [(305005L, "100:133,680600")]}, {'bibrecid': 681822L, 'authornameids': [305005L], 'authornameid_bibrefrec' : [(305005L, "100:133,681822")]}] ''' REALAUTHORS = [] ''' REALAUTHORS Holds data from the aidREALAUTHORS table. Structure: [{tag: value}*] Example: [{'realauthorid': '1', 'virtualauthorid': '1020', 'p': '0.5453'} ] ''' REALAUTHOR_DATA = [] ''' REALAUTHOR_DATA Holds data from the aidREALAUTHORDATA table. Structure: [{tag: value}*] Example: [{'realauthorid: '1', 'tag': 'affiliation', 'value': '2003-04;;Chen, Alex;;Fermilab', 'va_count': '1', 'va_np': '0', 'va_p': '0'}, {'realauthorid: '1', 'tag': 'affiliation', 'value': '2007-05;;Chen, Alex Zuxing;;Fermilab', 'va_count': '1', 'va_np': '0', 'va_p': '0'}, {'realauthorid: '1', 'tag': 'coauthor', 'value': 'Chen, Alex;;Chou, W.', 'va_count': '1', 'va_np': '0', 'va_p': '0'} ] ''' VIRTUALAUTHORS = [] ''' VIRTUALAUTHORS Holds data from the aidVIRTUALAUTHORS table. Structure: [{tag: value}*] Example: [{'virtualauthorid': '3', 'authornamesid': '42555', 'p': '0.9', 'clusterid': '2'} ] ''' VIRTUALAUTHOR_DATA = [] ''' VIRTUALAUTHOR_DATA Holds data from the aidVIRTUALAUTHORSDATA table. Structure: [{tag: value}*] Example: [{'virtualauthorid' : '1' 'tag': 'authorIndex' 'value': '0'}, {'virtualauthorid' : '1', 'tag': 'bibrec_id', 'value': '680600', {'virtualauthorid' : '1', 'tag': 'connected', 'value': 'False'}, {'virtualauthorid' : '1', 'tag': 'orig_authorname_id', 'value': '305005'}, {'virtualauthorid' : '1', 'tag': 'orig_name_string', 'value': 'Chen, .J.'} ] ''' VIRTUALAUTHOR_CLUSTERS = [] ''' VIRTUALAUTHOR_CLUSTERS Holds data from the aidVIRTUALAUTHORS_clusters table. Structure: [{tag: value}*] Example: [{'clusterid': '1', 'clustername': 'Chen, A.'}, {'clusterid': '2', 'clustername': 'Chen, A. A.'} ] ''' VIRTUALAUTHOR_CLUSTER_CACHE = {} ''' VIRTUALAUTHOR_CLUSTER_CACHE Holds Name->ClusterID Mappings Structure: {'Name': [ClusterIDs]} Example: {'Ellison, J.': [2170, 2171, 2172, 2173, 2174, 2175, 2176, 2177, 2178]} ''' VIRTUALAUTHOR_PROCESS_QUEUE = Queue.Queue() ''' VIRTUALAUTHOR_PROCESS_QUEUE Holds the virtual author ids that are to be processed. 
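    Structure: Queue.Queue() holding virtualauthorid values
    Example (hypothetical ids): a queue containing 1, 2, 3, waiting to be
        processed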
''' ID_TRACKER = {} ''' ID_TRACKER Holds information about the current/next id of virtual or real author entities. Structure: {tracker name: value} Example: {"va_id_counter": 12332L, "raid_counter": 122L, "last_updated_va": 122L, "cluster_id": 166} ''' RELEVANT_RECORDS = {} ''' RELEVANT_RECORDS Holds all the information about the documents referenced by the authors in memory Structure: {bibrecid: data dict from get_record} Example: {1: {'001': [([], ' ', ' ', '3', 1)], '035': [([('a', 'Test:1750qe'), ('9', 'SPIRESTeX')], ' ', '', '', 10)], '100': [([('a', 'Test, Author J.')], ' ', ' ', '', 3)], '245': [([('a', 'The test record')], ' ', ' ', '', 4)], '260': [([('c', '1750')], ' ', ' ', '', 6)], '269': [([('c', '1750')], ' ', ' ', '', 5)], '690': [([('a', 'Preprint')], 'C', ' ', '', 2)], '961': [([('x', '2001-11-12')], ' ', ' ', '', 7),         ([('c', '2003-07-21')], ' ', ' ', '', 8)], '970': [([('a', 'SPIRES-4772695')], ' ', ' ', '', 9)], '980': [([('a', 'unknown')], ' ', ' ', '', 11)]} } ''' RA_VA_CACHE = {} ''' RA_VA_CACHE Holds information about the connection of virtual authors to real authors This caching allows an enormous speedup compared to assessing this information each time separately. Structure: {hashtag of VA IDs: list of RA IDs} ''' CITES_DICT = {} CITED_BY_DICT = {} ''' CITES_DICT and CITED_BY_DICT Hold information about citations for the job creation process. Structure: {id_bibrec: [list of bibrecs that are cited by/cite the key]} ''' UPDATES_LOG = {"deleted_vas": set(), "touched_vas": set(), "new_ras": set(), "new_vas": set(), "rec_updates": set()} ''' UPDATES_LOG Keeps track of updated RAs and VAs to minimize database activities upon updating it from the mem cache. Structure: {"tag": set of ids} ''' RUNTIME_CONFIG = {"populate_aid_from_personid": False} # pylint: enable=W0105 def update_log(logbook, value): ''' Adds a value to the set of the logbook @param logbook: the name of the log @type logbook: string @param value: the value to add @type value: int ''' logbooks = ("deleted_vas", "touched_vas", "new_ras", "new_vas", "rec_updates") if not logbook in logbooks: raise ValueError("Valid logbooks are %s" % str(logbooks)) if not isinstance(value, int): try: value = int(value) except (ValueError, TypeError): raise ValueError("Logbook Value must be an int.") if logbook in UPDATES_LOG: UPDATES_LOG[logbook].add(value) else: UPDATES_LOG[logbook] = set((value,)) def set_tracker(tracker_name, value): ''' Sets a specified tracker to a specified value @param tracker_name: the name of the tracker (e.g. va_id_counter) @type tracker_name: string @param value: the value the tracker shall be updated to @type value: int ''' if not isinstance(value, int): try: value = int(value) except (ValueError, TypeError): raise ValueError("Tracker Value is expected to be an int!") if tracker_name in ID_TRACKER: ID_TRACKER[tracker_name].add(value) else: ID_TRACKER[tracker_name] = value def increment_tracker(tracker_name): ''' Increments a specified tracker by one (1). @param tracker_name: the name of the tracker (e.g. va_id_counter) @type tracker_name: string @return: the new value of the tracker @rtype: int ''' if tracker_name in ID_TRACKER: ID_TRACKER[tracker_name] += 1 else: ID_TRACKER[tracker_name] = 1 return ID_TRACKER[tracker_name] def reset_mem_cache(doit=False): ''' This function will reset the memory cache. @param doit: Tell me, if you really want to do this. 
Defaults to false @type doit: boolean ''' if doit: AUTHOR_NAMES[:] = [] DOC_LIST[:] = [] REALAUTHORS[:] = [] REALAUTHOR_DATA[:] = [] VIRTUALAUTHORS[:] = [] VIRTUALAUTHOR_DATA[:] = [] VIRTUALAUTHOR_CLUSTERS[:] = [] VIRTUALAUTHOR_CLUSTER_CACHE.clear() VIRTUALAUTHOR_PROCESS_QUEUE.queue.clear() ID_TRACKER.clear() RELEVANT_RECORDS.clear() RA_VA_CACHE.clear() for key in UPDATES_LOG: UPDATES_LOG[key] = set() diff --git a/modules/bibauthorid/lib/bibauthorid_tables_utils.py b/modules/bibauthorid/lib/bibauthorid_tables_utils.py index 464a4c050..3f9c73b0b 100644 --- a/modules/bibauthorid/lib/bibauthorid_tables_utils.py +++ b/modules/bibauthorid/lib/bibauthorid_tables_utils.py @@ -1,1650 +1,1682 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ bibauthorid_tables_utils Bibauthorid's DB handler """ import sys import re import random import bibauthorid_config as bconfig import bibauthorid_structs as dat from bibauthorid_utils import split_name_parts, create_normalized_name from bibauthorid_utils import clean_name_string from bibauthorid_authorname_utils import update_doclist try: from search_engine import get_record, perform_request_search # from search_engine_utils import get_fieldvalues from bibrank_citation_searcher import get_citation_dict from dbquery import run_sql, run_sql_many from dbquery import OperationalError, ProgrammingError except ImportError: # from invenio.search_engine_utils import get_fieldvalues from invenio.search_engine import get_record, perform_request_search from invenio.bibrank_citation_searcher import get_citation_dict from invenio.dbquery import run_sql, run_sql_many from invenio.dbquery import OperationalError, ProgrammingError try: import unidecode UNIDECODE_ENABLED = True except ImportError: bconfig.LOGGER.error("Authorid will run without unidecode support! " "This is not recommended! Please install unidecode!") UNIDECODE_ENABLED = False def get_papers_recently_modified(date='00-00-00 00:00:00'): ''' Returns the bibrecs with modification date more recent then date, or all the bibrecs if no date is specified. @param date: date ''' papers = run_sql("select id from bibrec where modification_date > %s", (str(date),)) if papers: bibrecs = [i[0] for i in papers] bibrecs.append(-1) min_date = run_sql("select max(modification_date) from bibrec where " "id in " + str(tuple(bibrecs))) else: min_date = run_sql("select now()") return papers, min_date def populate_authornames_bibrefs_from_authornames(): ''' Populates aidAUTHORNAMESBIBREFS. For each entry in aidAUTHORNAMES creates a corresponding entry in aidA.B. so it's possible to search by bibrec/bibref at a reasonable speed as well and not only by name. 
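    Illustration (values borrowed from the AUTHOR_NAMES example, purely for
    show): an aidAUTHORNAMES row such as
    (1, 'Groom, Donald E.', '100:1,700:9912', 'Groom, Donald E.') produces two
    rows in aidAUTHORNAMESBIBREFS, (Name_id=1, bibref='100:1') and
    (Name_id=1, bibref='700:9912'), since the comma-separated bibrefs column is
    split into one row per bibref.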
''' nids = run_sql("select id,bibrefs from aidAUTHORNAMES") for nid in nids: for bibref in nid[1].split(','): if bconfig.TABLES_UTILS_DEBUG: print ('populate_authornames_bibrefs_from_authornames: Adding: ' ' %s %s' % (str(nid[0]), str(bibref))) run_sql("insert into aidAUTHORNAMESBIBREFS (Name_id, bibref) " "values (%s,%s)", (str(nid[0]), str(bibref))) def authornames_tables_gc(bunch_size=50): ''' Performs garbage collecting on the authornames tables. Potentially really slow. ''' bunch_start = run_sql("select min(id) from aidAUTHORNAMESBIBREFS") if len(bunch_start) >= 1: bunch_start = int(bunch_start[0][0]) else: return abfs_ids_bunch = run_sql("select id,Name_id,bibref from aidAUTHORNAMESBIBREFS limit " + str(bunch_start - 1) + "," + str(bunch_size)) bunch_start += bunch_size while len(abfs_ids_bunch) >= 1: bib100list = [] bib700list = [] for i in abfs_ids_bunch: if i[2].split(':')[0] == '100': bib100list.append(i[2].split(':')[1]) elif i[2].split(':')[0] == '700': bib700list.append(i[2].split(':')[1]) bib100liststr = '( ' for i in bib100list: bib100liststr += "'" + str(i) + "'," bib100liststr = bib100liststr[0:len(bib100liststr) - 1] + " )" bib700liststr = '( ' for i in bib700list: bib700liststr += "'" + str(i) + "'," bib700liststr = bib700liststr[0:len(bib700liststr) - 1] + " )" if len(bib100list) >= 1: bib10xids = run_sql("select id from bib10x where id in %s" % bib100liststr) else: bib10xids = [] if len(bib700list) >= 1: bib70xids = run_sql("select id from bib70x where id in %s" % bib700liststr) else: bib70xids = [] bib10xlist = [] bib70xlist = [] for i in bib10xids: bib10xlist.append(str(i[0])) for i in bib70xids: bib70xlist.append(str(i[0])) bib100junk = set(bib100list).difference(set(bib10xlist)) bib700junk = set(bib700list).difference(set(bib70xlist)) idsdict = {} for i in abfs_ids_bunch: idsdict[i[2]] = [i[0], i[1]] junklist = [] for i in bib100junk: junklist.append('100:' + i) for i in bib700junk: junklist.append('700:' + i) for junkref in junklist: try: id_to_remove = idsdict[junkref] run_sql("delete from aidAUTHORNAMESBIBREFS where id=%s", (str(id_to_remove[0]),)) if bconfig.TABLES_UTILS_DEBUG: print "authornames_tables_gc: idAUTHORNAMESBIBREFS deleting row " + str(id_to_remove) authrow = run_sql("select id,Name,bibrefs,db_name from aidAUTHORNAMES where id=%s", (str(id_to_remove[1]),)) if len(authrow[0][2].split(',')) == 1: run_sql("delete from aidAUTHORNAMES where id=%s", (str(id_to_remove[1]),)) if bconfig.TABLES_UTILS_DEBUG: print "authornames_tables_gc: aidAUTHORNAMES deleting " + str(authrow) else: bibreflist = '' for ref in authrow[0][2].split(','): if ref != junkref: bibreflist += ref + ',' bibreflist = bibreflist[0:len(bibreflist) - 1] run_sql("update aidAUTHORNAMES set bibrefs=%s where id=%s", (bibreflist, id_to_remove[1])) if bconfig.TABLES_UTILS_DEBUG: print "authornames_tables_gc: aidAUTHORNAMES updating " + str(authrow) + ' with ' + str(bibreflist) except (OperationalError, ProgrammingError, KeyError, IndexError, ValueError, TypeError): pass abfs_ids_bunch = run_sql("select id,Name_id,bibref from aidAUTHORNAMESBIBREFS limit " + str(bunch_start - 1) + ',' + str(bunch_size)) bunch_start += bunch_size def update_authornames_tables_from_paper(papers_list=None): """ Updates the authornames tables with the names on the given papers list @param papers_list: list of the papers which have been updated (bibrecs) ((1,),) For each paper of the list gathers all names, bibrefs and bibrecs to be added to aidAUTHORNAMES table, taking care of updating aidA.B. 
as well NOTE: update_authornames_tables_from_paper: just to remember: get record would be faster but we don't have the bibref there, maybe there is a way to rethink everything not to use bibrefs? How to address authors then? """ def update_authornames_tables(name, bibref): ''' Update the tables for one bibref,name touple ''' authornames_row = run_sql("select id,Name,bibrefs,db_name from aidAUTHORNAMES where db_name like %s", (str(name),)) authornames_bibrefs_row = run_sql("select id,Name_id,bibref from aidAUTHORNAMESBIBREFS " "where bibref like %s", (str(bibref),)) #@XXX: update_authornames_tables: if i'm not wrong there should always be only one result; will be checked further on if ((len(authornames_row) > 1) or (len(authornames_bibrefs_row) > 1) or (len(authornames_row) < len(authornames_bibrefs_row))): if bconfig.TABLES_UTILS_DEBUG: print "update_authornames_tables: More then one result or missing authornames?? Something is wrong, not updating" + str(authornames_row) + str(authornames_bibrefs_row) return if len(authornames_row) == 1: # we have an hit for the name string; check if there is the 'new' bibref associated, # if yes there is nothing to do, otherwise shold add it here and in the ANbibrefs table if authornames_row[0][2].count(bibref) < 1: if bconfig.TABLES_UTILS_DEBUG: print 'update_authornames_tables: Adding new bibref to ' + str(authornames_row) + ' ' + str(name) + ' ' + str(bibref) run_sql("update aidAUTHORNAMES set bibrefs=%s where id=%s", (authornames_row[0][2] + ',' + str(bibref), authornames_row[0][0])) if len(authornames_bibrefs_row) < 1: # we have to add the bibref to the name, would be strange if it would be already there run_sql("insert into aidAUTHORNAMESBIBREFS (Name_id,bibref) values (%s,%s)", (authornames_row[0][0], str(bibref))) else: if bconfig.TABLES_UTILS_DEBUG: print 'update_authornames_tables: Nothing to add to ' + str(authornames_row) + ' ' + str(name) + ' ' + str(bibref) else: #@NOTE: update_authornames_tables: we don't have the name string in the db: the name associated to the bibref is changed # or this is a new name? Experimenting with bibulpload looks like if a name on a paper changes a new bibref is created; # if len(authornames_bibrefs_row) == 1: # If len(authornames_row) is zero but we have a row in authornames_bibrefs_row means that # the name string is changed, somehow! # @FIXME: update_authornames_tables: this case should really be considered? if bconfig.TABLES_UTILS_DEBUG: print 'update_authornames_tables: The name associated to the bibref is changed?? ' + str(name) + ' ' + str(bibref) else: artifact_removal = re.compile("[^a-zA-Z0-9]") authorname = "" test_name = name.decode('utf-8') if UNIDECODE_ENABLED: test_name = unidecode.unidecode(name.decode('utf-8')) raw_name = artifact_removal.sub("", test_name) if len(raw_name) > 1: authorname = name.decode('utf-8') if len(raw_name) > 1: dbname = authorname else: dbname = 'Error in name parsing!' 
clean_name = create_normalized_name(split_name_parts(name)) authornamesid = run_sql("insert into aidAUTHORNAMES (Name,bibrefs,db_name) values (%s,%s,%s)", (clean_name, str(bibref), dbname)) run_sql("insert into aidAUTHORNAMESBIBREFS (Name_id,bibref) values (%s,%s)", (authornamesid, str(bibref))) if bconfig.TABLES_UTILS_DEBUG: print 'update_authornames_tables: Created new name ' + str(authornamesid) + ' ' + str(name) + ' ' + str(bibref) tables = [['bibrec_bib10x', 'bib10x', '100__a', '100'], ['bibrec_bib70x', 'bib70x', '700__a', '700']] if not papers_list: papers_list = run_sql("select id from bibrec") for paper in papers_list: for table in tables: sqlstr = "select id_bibxxx from %s where id_bibrec=" % table[0] bibrefs = run_sql(sqlstr + "%s", (str(paper[0]),)) for ref in bibrefs: sqlstr = "select value from %s where tag='%s' and id=" % (table[1], table[2]) name = run_sql(sqlstr + "%s", (str(ref[0]),)) if len(name) >= 1: update_authornames_tables(name[0][0], table[3] + ':' + str(ref[0])) def populate_authornames(): """ Author names table population from bib10x and bib70x Average Runtime: 376.61 sec (6.27 min) for 327k entries Should be called only with empty table, then use update_authornames_tables_from_paper with the new papers which are coming in or modified. """ max_rows_per_run = bconfig.TABLE_POPULATION_BUNCH_SIZE if max_rows_per_run == -1: max_rows_per_run = 5000 + max100 = run_sql("SELECT COUNT(id) FROM bib10x WHERE tag = '100__a'") + max700 = run_sql("SELECT COUNT(id) FROM bib70x WHERE tag = '700__a'") + tables = "bib10x", "bib70x" authornames_is_empty_checked = 0 authornames_is_empty = 1 # Bring author names from bib10x and bib70x to authornames table for table in tables: if table == "bib10x": table_number = "100" else: table_number = "700" querylimiter_start = 0 querylimiter_max = eval('max' + str(table_number) + '[0][0]') if bconfig.TABLES_UTILS_DEBUG: print "\nProcessing %s (%s entries):" % (table, querylimiter_max) sys.stdout.write("0% ") sys.stdout.flush() while querylimiter_start <= querylimiter_max: if bconfig.TABLES_UTILS_DEBUG: sys.stdout.write(".") sys.stdout.flush() percentage = int(((querylimiter_start + max_rows_per_run) * 100) / querylimiter_max) sys.stdout.write(".%s%%." % (percentage)) sys.stdout.flush() # Query the Database for a list of authors from the correspondent # tables--several thousands at a time bib = run_sql("SELECT id, value FROM %s WHERE tag = '%s__a' " "LIMIT %s, %s" % (table, table_number, querylimiter_start, max_rows_per_run)) authorexists = None querylimiter_start += max_rows_per_run for i in bib: # For mental sanity, exclude things that are not names... # Yes, I know that there are strange names out there! # Yes, I read the 40 misconceptions about names. # Yes, I know! # However, these statistical outlaws are harmful. 
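                # Concretely (hypothetical field values): strings such as "-",
                # "?" or a lone initial like "J." keep at most one alphanumeric
                # character after the substitution below and are skipped, while
                # "Groom, Donald E." passes the check and is processed as an
                # author name.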
artifact_removal = re.compile("[^a-zA-Z0-9]") authorname = "" if not i[1]: continue test_name = i[1].decode('utf-8') if UNIDECODE_ENABLED: test_name = unidecode.unidecode(i[1].decode('utf-8')) raw_name = artifact_removal.sub("", test_name) if len(raw_name) > 1: authorname = i[1].decode('utf-8') if not authorname: continue if not authornames_is_empty_checked: authornames_is_empty = run_sql("SELECT COUNT(id) " "FROM aidAUTHORNAMES") if authornames_is_empty[0][0] == 0: authornames_is_empty_checked = 1 authornames_is_empty = 1 if not authornames_is_empty: # Find duplicates in the database and append id if # duplicate is found authorexists = run_sql("SELECT id, name, bibrefs, db_name " "FROM aidAUTHORNAMES " "WHERE db_name = %s", (authorname.encode("utf-8"),)) bibrefs = "%s:%s" % (table_number, i[0]) if not authorexists: insert_name = "" if len(authorname) > 240: bconfig.LOGGER.warn("\nName too long, truncated to 254" " chars: %s" % (authorname)) insert_name = authorname[0:254] else: insert_name = authorname cnn = create_normalized_name snp = split_name_parts aid_name = authorname if UNIDECODE_ENABLED: aid_name = cnn(snp(unidecode.unidecode(insert_name))) aid_name = aid_name.replace("\"", "") else: aid_name = cnn(snp(insert_name)) aid_name = aid_name.replace(u"\u201c", "") aid_name = aid_name.replace(u"\u201d", "") run_sql("INSERT INTO aidAUTHORNAMES VALUES" " (NULL, %s, %s, %s)", (aid_name.encode('utf-8'), bibrefs, insert_name.encode('utf-8'))) if authornames_is_empty: authornames_is_empty = 0 else: if authorexists[0][2].count(bibrefs) >= 0: upd_bibrefs = "%s,%s" % (authorexists[0][2], bibrefs) run_sql("UPDATE aidAUTHORNAMES SET bibrefs = " "%s WHERE id = %s", (upd_bibrefs, authorexists[0][0])) if bconfig.TABLES_UTILS_DEBUG: sys.stdout.write(" Done.") sys.stdout.flush() def get_bibref_name_string(bibref): ''' Returns the name string associated with the given bibref @param: bibref ((100:123,),) ''' name = run_sql("select db_name from aidAUTHORNAMES where id=(select Name_id from aidAUTHORNAMESBIBREFS where bibref=%s)", (str(bibref[0][0]),)) if len(name) > 0: return name[0][0] else: return '' def get_bibrefs_from_name_string(string): ''' Returns bibrefs associated to a name string @param: string: name ''' bibrefs = run_sql("select bibrefs from aidAUTHORNAMES where db_name=%s ", (str(string),)) return bibrefs def get_diff_marc10x70x_to_anames(): ''' Determines the difference between the union of bib10x and bib70x and the aidAUTHORNAMES table. It will return the entries which are present in bib10x and bib70x but not in aidAUTHORNAMES. Meant to be run periodically. @todo: get_diff_marc10x70x_to_anames: find meaningful use for the returned results. @return: a list of the author names not contained in the authornames table @rtype: list ''' run_sql("DROP VIEW authors") run_sql("create view authors AS \ (SELECT value FROM bib10x WHERE tag =\"100__a\") \ UNION \ (SELECT value FROM bib70x WHERE tag =\"700__a\")") diff = run_sql("SELECT value from authors LEFT JOIN aidAUTHORNAMES as b" + " ON (authors.value = b.Name) WHERE b.name IS NULL") return diff def populate_doclist_for_author_surname(surname, surname_variations=None): """ Searches for all the documents containing a given surname and processes them: creates the virtual author for each author on a document. @param surname: The search is based on this last name. 
@type surname: string """ # if not dat.CITES_DICT: # cites = get_citation_dict("citationdict") # # for key in cites: # dat.CITES_DICT[key] = cites[key] # # if not dat.CITED_BY_DICT: # cited_by = get_citation_dict("reversedict") # # for key in cited_by: # dat.CITED_BY_DICT[key] = cited_by[key] bconfig.LOGGER.log(25, "Populating document list for %s" % (surname)) if surname_variations: init_authornames(surname, surname_variations) else: init_authornames(surname) authors = [row for row in dat.AUTHOR_NAMES if not row['processed']] for author in authors: marc_100 = [] marc_700 = [] temp_marc = author['bibrefs'].split(',') for j in temp_marc: marcfield, internalid = j.split(':') if marcfield == '100': marc_100.append(internalid) elif marcfield == '700': marc_700.append(internalid) else: bconfig.LOGGER.error("Wrong MARC field. How did you do" " that?!--This should never happen! boo!") bibrecs = [] if marc_100: for m100 in marc_100: refinfo = run_sql("SELECT id_bibrec FROM bibrec_bib10x " "WHERE id_bibxxx = %s", (m100,)) if refinfo: for recid in refinfo: bibrecs.append((recid[0], "100:%s" % m100)) if marc_700: for m700 in marc_700: refinfo = run_sql("SELECT id_bibrec FROM bibrec_bib70x " "WHERE id_bibxxx = %s", (m700,)) if refinfo: for recid in refinfo: bibrecs.append((recid[0], "700:%s" % m700)) # # relevant_records = [] # # for bibrec in bibrecs: # go_next = False # # for value in get_fieldvalues(bibrec[0], "980__c"): # if value.lower().count('delete'): # go_next = True # # if go_next: # continue # # for value in get_fieldvalues(bibrec[0], "980__a"): # if value.lower().count('delet'): # go_next = True # # if bconfig.EXCLUDE_COLLECTIONS: # if value in bconfig.EXCLUDE_COLLECTIONS: # go_next = True # break # # if bconfig.LIMIT_TO_COLLECTIONS: # if not value in bconfig.LIMIT_TO_COLLECTIONS: # go_next = True # else: # go_next = False # break # # if go_next: # continue # # relevant_records.append(bibrec) # # if load_records_to_mem_cache([br[0] for br in relevant_records]): # for bibrec in relevant_records: # update_doclist(bibrec[0], author['id'], bibrec[1]) # authornames = [row["name"] for row in authors] relevant_records = [] coll_limit = "" coll_excl = "" authorqry = "" query = "" # recqry = "" if bconfig.EXCLUDE_COLLECTIONS: coll_excl = 'and not (collection:"%s")' % '" or collection:"'.join(bconfig.EXCLUDE_COLLECTIONS) if bconfig.LIMIT_TO_COLLECTIONS: coll_limit = 'and (collection:"%s")' % '" or collection:"'.join(bconfig.LIMIT_TO_COLLECTIONS) # if bibrecs: # recqry = '(recid:%s)' % ' or recid:'.join([str(r) for r in bibrecs]) if author["db_name"]: authorqry = '(exactauthor:"%s")' % author["db_name"] if authorqry: query = "%s %s %s" % (authorqry, coll_excl, coll_limit) query = query.strip() if query: se_results = list(perform_request_search(p=query)) relevant_records = [row for row in bibrecs if int(row[0]) in se_results] if dat.RUNTIME_CONFIG["populate_aid_from_personid"]: for bibrec in relevant_records: update_doclist(bibrec[0], author['id'], bibrec[1]) elif load_records_to_mem_cache([r[0] for r in relevant_records]): for bibrec in relevant_records: update_doclist(bibrec[0], author['id'], bibrec[1]) def load_records_to_mem_cache(bibrec_ids): ''' Loads all the records specified in the list into the memory storage facility. It will try to attach citation information to each record in the process. 
@param bibrec_ids: list of bibrec IDs to load to memory @type bibrec_ids: list @return: Success (True) or failure (False) of the process @rtype: boolean ''' if not bibrec_ids: return False for bibrec in bibrec_ids: if not bibrec in dat.RELEVANT_RECORDS: rec = get_record(bibrec) if bconfig.LIMIT_AUTHORS_PER_DOCUMENT: is_collaboration = False authors = 0 try: for field in rec['710'][0][0]: if field[0] == 'g': is_collaboration = True break except KeyError: pass if is_collaboration: # If experimentalists shall be excluded uncomment # the following line #continue pass else: try: for field in rec['100'][0][0]: if field[0] == 'a': authors += 1 break except KeyError: pass try: for coauthor in rec['700']: if coauthor[0][0][0] == 'a': authors += 1 except KeyError: pass if authors > bconfig.MAX_AUTHORS_PER_DOCUMENT: continue dat.RELEVANT_RECORDS[bibrec] = rec cites = [] cited_by = [] try: # cites = dat.CITES_DICT[bibrec] cites = get_citation_dict("citationdict")[bibrec] except KeyError: pass try: # cited_by = dat.CITED_BY_DICT[bibrec] cited_by = get_citation_dict("reversedict")[bibrec] except KeyError: pass dat.RELEVANT_RECORDS[bibrec]['cites'] = cites dat.RELEVANT_RECORDS[bibrec]['cited_by'] = cited_by return True def init_authornames(surname, lastname_variations=None): ''' Initializes the AUTHOR_NAMES memory storage @param surname: The surname to search for @type surname: string ''' if len(dat.AUTHOR_NAMES) > 0: existing = [row for row in dat.AUTHOR_NAMES if row['name'].split(",")[0] == surname] if existing: bconfig.LOGGER.log(25, "AUTHOR_NAMES already holds the " "correct data.") else: bconfig.LOGGER.debug("AUTHOR_NAMES will have additional content") for updated in [row for row in dat.AUTHOR_NAMES if not row['processed']]: updated['processed'] = True _perform_authornames_init(surname) else: _perform_authornames_init(surname, lastname_variations) def _perform_authornames_init(surname, lastname_variations=None): ''' Performs the actual AUTHOR_NAMES memory storage init by reading values from the database @param surname: The surname to search for @type surname: string ''' db_authors = None if len(surname) < 4 and not lastname_variations: lastname_variations = [surname] if (not lastname_variations or (lastname_variations and [nm for nm in lastname_variations if nm.count("\\")])): sql_query = (r"SELECT id, name, bibrefs, db_name FROM aidAUTHORNAMES " "WHERE name REGEXP %s") if (lastname_variations and [nm for nm in lastname_variations if nm.count("\\")]): x = sorted(lastname_variations, key=len, reverse=True) # In order to fight escaping problems, we fall back to regexp mode # if we find a backslash somewhere. surname = x[0] # instead of replacing with ' ', this will construct the regex for the # SQL query as well as the next if statements. 
surname = clean_name_string(surname, replacement="[^0-9a-zA-Z]{0,2}", keep_whitespace=False) if not surname.startswith("[^0-9a-zA-Z]{0,2}"): surname = "[^0-9a-zA-Z]{0,2}%s" % (surname) if not surname.startswith("^"): surname = "^%s" % surname surname = surname + "[^0-9a-zA-Z ]{1,2}" if surname.count("\\"): surname.replace("\\", ".") try: db_authors = run_sql(sql_query, (surname,)) except (OperationalError, ProgrammingError), emsg: bconfig.LOGGER.exception("Not able to select author name: %s" % emsg) else: qnames = [] vari_query = "" for vname in lastname_variations: if vari_query: vari_query += " OR" vari_query += ' name like %s' vname_r = r"""%s""" % vname qnames.append(vname_r + ", %") if not vari_query: return sql_query = ("SELECT id, name, bibrefs, db_name " "FROM aidAUTHORNAMES WHERE" + vari_query) try: db_authors = run_sql(sql_query, tuple(qnames)) except (OperationalError, ProgrammingError), emsg: bconfig.LOGGER.exception("Not able to select author name: %s" % emsg) if not db_authors: return for author in db_authors: dat.AUTHOR_NAMES.append({'id': author[0], 'name': author[1].decode('utf-8'), 'bibrefs': author[2], 'db_name': author[3].decode('utf-8'), 'processed': False}) def find_all_last_names(): ''' Filters out all last names from all names in the database. @return: a list of last names @rtype: list of strings ''' all_names = run_sql("SELECT Name FROM aidAUTHORNAMES") last_names = set() for dbname in all_names: if not dbname: continue full_name = dbname[0] name = split_name_parts(full_name.decode('utf-8'))[0] # For mental sanity, exclude things that are not names... # - Single letter names # - Single number names # - Names containing only numbers and/or symbols # Yes, I know that there are strange names out there! # Yes, I read the 40 misconceptions about names. # Yes, I know! # However, these statistical outlaws are harmful to the data set. artifact_removal = re.compile("[^a-zA-Z0-9]") authorname = None test_name = name if UNIDECODE_ENABLED: test_name = unidecode.unidecode(name) raw_name = artifact_removal.sub("", test_name) if len(raw_name) > 1: authorname = name if not authorname: continue if len(raw_name) > 1: last_names.add(authorname) del(all_names) return list(last_names) def write_mem_cache_to_tables(sanity_checks=False): ''' Reads every memory cache and writes its contents to the appropriate table in the database. @param sanity_checks: Perform sanity checks before inserting (i.e. is the data already present in the db?) and after the insertion (i.e. is the data entered correctly?) 
@type sanity_checks: boolean ''' ra_id_offset = run_sql("SELECT max(realauthorID) FROM" + " aidREALAUTHORS")[0][0] va_id_offset = run_sql("SELECT max(virtualauthorID) FROM" + " aidVIRTUALAUTHORS")[0][0] cluster_id_offset = run_sql("SELECT max(id) FROM" " aidVIRTUALAUTHORSCLUSTERS")[0][0] if not ra_id_offset or dat.RUNTIME_CONFIG['populate_aid_from_personid']: ra_id_offset = 0 if not va_id_offset: va_id_offset = 0 if not cluster_id_offset: cluster_id_offset = 0 max_va_id = dat.ID_TRACKER["va_id_counter"] if max_va_id <= 1: max_va_id = 2 random_va_id = random.randint(1, max_va_id - 1) va_mem_data = [row['value'] for row in dat.VIRTUALAUTHOR_DATA if (row["virtualauthorid"] == random_va_id and row['tag'] == "orig_authorname_id")][0] if sanity_checks: if va_mem_data: check_on_va = run_sql("SELECT id,virtualauthorID,tag,value FROM aidVIRTUALAUTHORSDATA " "WHERE tag='orig_authorname_id' AND " "value=%s" , (va_mem_data,)) if check_on_va: bconfig.LOGGER.error("Sanity check reported that the data " "exists. We'll skip this record for now. " "Please check the data set manually.") return False bconfig.LOGGER.log(25, "Writing to persistence layer") bconfig.LOGGER.log(25, "Offsets...RA: %s; VA: %s; CL: %s" % (ra_id_offset, va_id_offset, cluster_id_offset)) # batch_max = bconfig.TABLE_POPULATION_BUNCH_SIZE query = [] query_prelude = ("INSERT INTO aidVIRTUALAUTHORSCLUSTERS (cluster_name)" " VALUES (%s)") for va_cluster in dat.VIRTUALAUTHOR_CLUSTERS: encoded_value = None not_encoded_value = va_cluster['clustername'] try: if isinstance(not_encoded_value, unicode): encoded_value = not_encoded_value[0:59].encode('utf-8') elif isinstance(not_encoded_value, str): encoded_value = not_encoded_value[0:59] else: encoded_value = str(not_encoded_value)[0:59] except (UnicodeEncodeError, UnicodeDecodeError), emsg: bconfig.LOGGER.error("Cluster Data encoding error (%s): %s" % (type(not_encoded_value), emsg)) continue query.append((encoded_value,)) if query: try: run_sql_many(query_prelude, tuple(query)) except (OperationalError, ProgrammingError), emsg: bconfig.LOGGER.critical("Inserting into virtual author " "cluster table failed: %s" % emsg) return False query = [] query_prelude = ("INSERT INTO aidVIRTUALAUTHORSDATA " "(virtualauthorID, tag, value) VALUES " "(%s, %s, %s)") for va_data in dat.VIRTUALAUTHOR_DATA: encoded_value = None not_encoded_value = va_data['value'] try: if isinstance(not_encoded_value, unicode): encoded_value = not_encoded_value[0:254].encode('utf-8') elif isinstance(not_encoded_value, str): encoded_value = not_encoded_value[0:254] else: encoded_value = str(not_encoded_value)[0:254] except (UnicodeEncodeError, UnicodeDecodeError), emsg: bconfig.LOGGER.error("VA Data encoding error (%s): %s" % (type(not_encoded_value), emsg)) continue query.append((va_data['virtualauthorid'] + va_id_offset, va_data['tag'], encoded_value)) if query: try: run_sql_many(query_prelude, tuple(query)) except (OperationalError, ProgrammingError), emsg: bconfig.LOGGER.critical("Inserting into virtual author " "data table failed: %s" % emsg) return False query = [] query_prelude = ("INSERT INTO aidVIRTUALAUTHORS " "(virtualauthorID, authornamesID, p, clusterID) " "VALUES (%s, %s, %s, %s)") for va_entry in dat.VIRTUALAUTHORS: query.append((va_entry['virtualauthorid'] + va_id_offset, va_entry['authornamesid'], va_entry['p'], va_entry['clusterid'] + cluster_id_offset)) if query: try: run_sql_many(query_prelude, tuple(query)) except (OperationalError, ProgrammingError), emsg: bconfig.LOGGER.critical("Inserting into virtual author 
" "table failed: %s" % emsg) return False query = [] query_prelude = ("INSERT INTO aidREALAUTHORDATA " "(realauthorID, tag, value, va_count, " "va_names_p, va_p) VALUES " "(%s, %s, %s, %s, %s, %s)") for ra_data in dat.REALAUTHOR_DATA: if not ra_data['tag'] == 'outgoing_citation': encoded_value = None not_encoded_value = ra_data['value'] try: if isinstance(not_encoded_value, unicode): encoded_value = not_encoded_value[0:254].encode('utf-8') elif isinstance(not_encoded_value, str): encoded_value = not_encoded_value[0:254] else: encoded_value = str(not_encoded_value)[0:254] except (UnicodeEncodeError, UnicodeDecodeError), emsg: bconfig.LOGGER.error("RA Data encoding error (%s): %s" % (type(not_encoded_value), emsg)) continue query.append((ra_data['realauthorid'] + ra_id_offset, ra_data['tag'], encoded_value, ra_data['va_count'], ra_data['va_np'], ra_data['va_p'])) if query: try: run_sql_many(query_prelude, tuple(query)) except (OperationalError, ProgrammingError), emsg: bconfig.LOGGER.critical("Inserting into real author " "data table failed: %s" % emsg) return False query = [] query_prelude = ("INSERT INTO aidREALAUTHORS " "(realauthorID, virtualauthorID, p) VALUES (%s, %s, %s)") for ra_entry in dat.REALAUTHORS: query.append((ra_entry['realauthorid'] + ra_id_offset, ra_entry['virtualauthorid'] + va_id_offset, ra_entry['p'])) if query: try: run_sql_many(query_prelude, tuple(query)) except (OperationalError, ProgrammingError), emsg: bconfig.LOGGER.critical("Inserting into real author " "table failed: %s" % emsg) return False query = [] query_prelude = ("INSERT INTO aidDOCLIST " "(bibrecID, processed_author) VALUES (%s, %s)") for doc in dat.DOC_LIST: for processed_author in doc['authornameids']: query.append((doc['bibrecid'], processed_author)) if query: try: run_sql_many(query_prelude, tuple(query)) except (OperationalError, ProgrammingError), emsg: bconfig.LOGGER.critical("Inserting into doc list " "table failed: %s" % emsg) return False query = [] if sanity_checks: if va_mem_data: check_on_va = run_sql("SELECT id,virtualauthorID,tag,value FROM aidVIRTUALAUTHORSDATA " "WHERE tag='orig_authorname_id' AND " "value=%s" , (va_mem_data,)) if not check_on_va: bconfig.LOGGER.error("Sanity check reported that no data " " exists in the database after writing " " to it.") return False bconfig.LOGGER.log(25, "Everything is now written to the database. " "Thanks. Bye.") return True def get_existing_last_names(): ''' Find all authors that have been processed and written to the database. Extract all last names from this list and return these last names. Especially helpful to exclude these clusters (last names) from a run. @return: list of last names @rtype: list of strings ''' bconfig.LOGGER.log(25, "Reading info about existing authors from database") db_lnames = set() db_names = run_sql("select value from aidVIRTUALAUTHORSDATA where" + " tag='orig_name_string'") for i in db_names: db_lnames.add(i[0].split(',')[0]) del(db_names) return list(db_lnames) def get_len_authornames_bibrefs(): ''' Reads the lengths of authornames and bibrefs. Used to determine if esstential tables already exist. 
@return: dict({'names':-1, 'bibrefs':-1}) @rtype: dict ''' lengths = {'names':-1, 'bibrefs':-1} if check_and_create_aid_tables(): authornames_len = run_sql("SELECT count(id) from aidAUTHORNAMES") bibrefs_len = run_sql("SELECT count(id) from aidAUTHORNAMESBIBREFS") try: lengths['names'] = int(authornames_len[0][0]) lengths['bibrefs'] = int(bibrefs_len[0][0]) except (ValueError, TypeError): lengths['names'] = -1 lengths['bibrefs'] = -1 return lengths def check_and_create_aid_tables(): ''' Checks if the database tables for Bibauthorid exist. If not, creates them @return: True if tables exist, False if there was an error @rtype: boolean ''' try: if not run_sql("show tables like 'aidAUTHORNAMES';"): return False except (ProgrammingError, OperationalError): return False return True def load_mem_cache_from_tables(): ''' Loads database content for an author's last name cluster into the memory storage facility. @precondition: memory storage facility needs to be loaded with respective authornames data (init_authornames(lastname)) @return: Success (True) or failure (False) of the loading process @rtype: boolean ''' # print "check for authornames mem table" if not dat.AUTHOR_NAMES: return False authornames_ids = [row['id'] for row in dat.AUTHOR_NAMES] if not authornames_ids: return False # print "Building offsets" ra_id_offset = run_sql("SELECT max(realauthorID) FROM" " aidREALAUTHORS")[0][0] va_id_offset = run_sql("SELECT max(virtualauthorID) FROM" " aidVIRTUALAUTHORS")[0][0] cluster_id_offset = run_sql("SELECT max(id) FROM" " aidVIRTUALAUTHORSCLUSTERS")[0][0] dat.set_tracker("raid_counter", ra_id_offset + 1) dat.set_tracker("va_id_counter", va_id_offset + 1) dat.set_tracker("cluster_id", cluster_id_offset + 1) # print "working on authornames ids..." for authornames_id in authornames_ids: db_vas = run_sql("SELECT virtualauthorid, authornamesid, p, clusterid " "from aidVIRTUALAUTHORS WHERE authornamesid = %s", (authornames_id,)) # print "loading VAs for authid %s" % authornames_id db_vas_set = set([row[0] for row in db_vas]) if not db_vas_set: db_vas_set = (-1, -1) else: db_vas_set.add(-1) db_vas_tuple = tuple(db_vas_set) db_ras = run_sql("SELECT realauthorid FROM " "aidREALAUTHORS WHERE virtualauthorid in %s" , (tuple(db_vas_tuple),)) if db_ras: db_ras_set = set([row[0] for row in db_ras]) db_ras_set.add(-1) db_ras_tuple = tuple(db_ras_set) db_ra_vas = run_sql("SELECT virtualauthorid FROM aidREALAUTHORS " "WHERE realauthorid in %s", (db_ras_tuple,)) db_ra_vas_set = set([row[0] for row in db_ra_vas]) db_ra_vas_set.add(-1) db_ras_tuple = tuple(db_ra_vas_set) db_vas_all = run_sql("SELECT virtualauthorid, authornamesid, p, " "clusterid FROM aidVIRTUALAUTHORS WHERE " "virtualauthorid in %s", (db_ras_tuple,)) else: db_vas_all = db_vas for db_va in db_vas_all: dat.VIRTUALAUTHORS.append({'virtualauthorid': db_va[0], 'authornamesid': db_va[1], 'p': db_va[2], 'clusterid': db_va[3]}) if not dat.VIRTUALAUTHORS: # print "No Virtual Authors loaded. None created before." 
return True # print "Loading clusters" cluster_ids = set([row['clusterid'] for row in dat.VIRTUALAUTHORS]) if not cluster_ids: cluster_ids = (-1, -1) else: cluster_ids.add(-1) db_va_clusters = run_sql("SELECT id, cluster_name FROM " "aidVIRTUALAUTHORSCLUSTERS WHERE id in %s" , (tuple(cluster_ids),)) # print "Storing clusters" for db_va_cluster in db_va_clusters: dat.VIRTUALAUTHOR_CLUSTERS.append({'clusterid': db_va_cluster[0], 'clustername': db_va_cluster[1]}) # print "Loading VA data" va_ids = set([row['virtualauthorid'] for row in dat.VIRTUALAUTHORS]) if not va_ids: va_ids = (-1, -1) else: va_ids.add(-1) # print "Storing VA data" db_va_data = run_sql("SELECT virtualauthorid, tag, value FROM " "aidVIRTUALAUTHORSDATA WHERE virtualauthorid in %s" , (tuple(va_ids),)) for db_va_dat in db_va_data: dat.VIRTUALAUTHOR_DATA.append({'virtualauthorid' : db_va_dat[0], 'tag': db_va_dat[1], 'value': db_va_dat[2]}) # print "Loading RAs" db_ras = run_sql("SELECT realauthorid, virtualauthorid, p FROM " "aidREALAUTHORS WHERE virtualauthorid in %s" , (tuple(va_ids),)) # print "Storing RAs" for db_ra in db_ras: dat.REALAUTHORS.append({'realauthorid': db_ra[0], 'virtualauthorid': db_ra[1], 'p': db_ra[2]}) # print "Loading RA data" ra_ids = set([row['realauthorid'] for row in dat.REALAUTHORS]) if not ra_ids: ra_ids = (-1, -1) else: ra_ids.add(-1) db_ra_data = run_sql("SELECT realauthorid, tag, value, va_count, " "va_names_p, va_p FROM aidREALAUTHORDATA WHERE " "realauthorid in %s", (tuple(ra_ids),)) # print "Storing RA data" for db_ra_dat in db_ra_data: dat.REALAUTHOR_DATA.append({'realauthorid': db_ra_dat[0], 'tag': db_ra_dat[1], 'value': db_ra_dat[2], 'va_count': db_ra_dat[3], 'va_np': db_ra_dat[4], 'va_p': db_ra_dat[5]}) # print "Loading doclist entries" bibrec_ids = set([int(row['value']) for row in dat.REALAUTHOR_DATA if row['tag'] == "bibrec_id"]) if not bibrec_ids: bibrec_ids = (-1, -1) else: bibrec_ids.add(-1) db_doclist = run_sql("SELECT bibrecid, processed_author FROM aidDOCLIST " "WHERE bibrecid in %s", (tuple(bibrec_ids),)) # print "Storing doclist entries" for db_doc in db_doclist: existing_item = [row for row in dat.DOC_LIST if row['bibrecid'] == db_doc[0]] if existing_item: for update in [row for row in dat.DOC_LIST if row['bibrecid'] == db_doc[0]]: if not db_doc[1] in update['authornameids']: update['authornameids'].append(db_doc[1]) else: dat.DOC_LIST.append({'bibrecid': db_doc[0], 'authornameids': [db_doc[1]]}) if set(bibrec_ids).remove(-1): # print "will load recs" if not load_records_to_mem_cache(list(bibrec_ids)): # print" FAILED loading records" return False return True def update_tables_from_mem_cache(sanity_checks=False, return_ra_updates=False): ''' Updates the tables in the database with the information in the memory storage while taking into account only changed data to optimize the time needed for the update. @param sanity_checks: Perform sanity checks while updating--slows down the process but might detect mistakes and prevent harm. Default: False @type sanity_checks: boolean @param return_ra_updates: Will force the method to return a list of real author ids that have been updated. 
        Default: False
    @type return_ra_updates: boolean

    @return: a tuple (status, ra_ids). status is True if the update went
        through without trouble and False if it did not; ra_ids is the list
        of updated real authors, or an empty list.
    @rtype: tuple of (boolean, list)
    '''
    del_ra_ids = set([-1])
    del_va_ids = dat.UPDATES_LOG['deleted_vas'].union(
        dat.UPDATES_LOG['touched_vas'])

    if del_va_ids:
        del_va_ids.add(-1)
        del_ra_ids_db = run_sql("SELECT realauthorid FROM aidREALAUTHORS "
                                "WHERE virtualauthorid in %s",
                                (tuple(del_va_ids),))

        for ra_id in del_ra_ids_db:
            del_ra_ids.add(ra_id[0])

        if sanity_checks:
            va_count_db = run_sql("SELECT COUNT(DISTINCT virtualauthorid) "
                                  "FROM aidVIRTUALAUTHORS WHERE "
                                  "virtualauthorid in %s",
                                  (tuple(del_va_ids),))

            try:
                va_count_db = int(va_count_db[0][0])
            except (ValueError, IndexError, TypeError):
                bconfig.LOGGER.exception("Error while reading number of "
                                         "virtual authors in database")
                va_count_db = -1

            if not (va_count_db == len(del_va_ids)):
                bconfig.LOGGER.error("Sanity checks reported that the number "
                                     "of virtual authors in the memory "
                                     "storage is not equal to the number of "
                                     "virtual authors in the database. "
                                     "Aborting update mission.")
                return (False, [])

        bconfig.LOGGER.log(25, "Removing updated entries from "
                           "persistence layer")
        run_sql("DELETE FROM aidVIRTUALAUTHORSDATA "
                "WHERE virtualauthorid in %s", (tuple(del_va_ids),))
        run_sql("DELETE FROM aidVIRTUALAUTHORS "
                "WHERE virtualauthorid in %s", (tuple(del_va_ids),))

        if len(tuple(del_ra_ids)) > 1:
            run_sql("DELETE FROM aidREALAUTHORDATA "
                    "WHERE realauthorid in %s", (tuple(del_ra_ids),))
            run_sql("DELETE FROM aidREALAUTHORS "
                    "WHERE realauthorid in %s", (tuple(del_ra_ids),))

    insert_ra_ids = dat.UPDATES_LOG['new_ras'].union(del_ra_ids)
    insert_va_ids = dat.UPDATES_LOG['new_vas'].union(
        dat.UPDATES_LOG['touched_vas'])

    bconfig.LOGGER.log(25, "Writing to persistence layer")

    ra_id_db_max = run_sql("SELECT max(realauthorID) FROM"
                           " aidREALAUTHORS")[0][0]
    va_id_db_max = run_sql("SELECT max(virtualauthorID) FROM"
                           " aidVIRTUALAUTHORS")[0][0]
    cluster_id_db_max = run_sql("SELECT max(id) FROM"
                                " aidVIRTUALAUTHORSCLUSTERS")[0][0]

    if not ra_id_db_max or not va_id_db_max or not cluster_id_db_max:
        return (False, [])

    new_clusters = [row for row in dat.VIRTUALAUTHOR_CLUSTERS
                    if row['clusterid'] > cluster_id_db_max]
    query = []

    if not insert_ra_ids or not insert_va_ids:
        bconfig.LOGGER.log(25, "Saving update to persistence layer finished "
                           "with success! (There was nothing to do)")
        return (True, [])

    query_prelude = ("INSERT INTO aidVIRTUALAUTHORSCLUSTERS (cluster_name)"
                     " VALUES (%s)")

    for va_cluster in new_clusters:
        encoded_value = None
        not_encoded_value = va_cluster['clustername']

        try:
            if isinstance(not_encoded_value, unicode):
                encoded_value = not_encoded_value[0:59].encode('utf-8')
            elif isinstance(not_encoded_value, str):
                encoded_value = not_encoded_value[0:59]
            else:
                encoded_value = str(not_encoded_value)[0:59]
        except (UnicodeEncodeError, UnicodeDecodeError), emsg:
            bconfig.LOGGER.error("Cluster Data encoding error (%s): %s"
                                 % (type(not_encoded_value), emsg))
            continue

        query.append((encoded_value,))

    if query:
        try:
            run_sql_many(query_prelude, tuple(query))
        except (OperationalError, ProgrammingError), emsg:
            bconfig.LOGGER.critical("Inserting into virtual author "
                                    "cluster table failed: %s" % emsg)
            # abort, keeping the documented (status, ra_ids) return contract
            return (False, [])
        query = []

    va_data_to_insert = [row for row in dat.VIRTUALAUTHOR_DATA
                         if row['virtualauthorid'] in insert_va_ids]

    if sanity_checks:
        db_existing_va_ids = run_sql("SELECT COUNT(DISTINCT virtualauthorid) "
                                     "FROM aidVIRTUALAUTHORS "
                                     "WHERE virtualauthorid in %s",
                                     (tuple(insert_va_ids),))

        try:
            db_existing_va_ids = int(db_existing_va_ids[0][0])
        except (ValueError, IndexError, TypeError):
            bconfig.LOGGER.exception("Error while reading number of "
                                     "virtual authors in database")
            db_existing_va_ids = -1

        if not (db_existing_va_ids == 0):
            bconfig.LOGGER.error("Sanity checks reported that the "
                                 "virtual authors in the memory storage "
                                 "that shall be inserted already exist "
                                 "in the database. Aborting update mission.")
            return (False, [])

    query_prelude = ("INSERT INTO aidVIRTUALAUTHORSDATA "
                     "(virtualauthorID, tag, value) VALUES "
                     "(%s, %s, %s)")

    for va_data in va_data_to_insert:
        encoded_value = None
        not_encoded_value = va_data['value']

        try:
            if isinstance(not_encoded_value, unicode):
                encoded_value = not_encoded_value[0:254].encode('utf-8')
            elif isinstance(not_encoded_value, str):
                encoded_value = not_encoded_value[0:254]
            else:
                encoded_value = str(not_encoded_value)[0:254]
        except (UnicodeEncodeError, UnicodeDecodeError), emsg:
            bconfig.LOGGER.error("VA Data encoding error (%s): %s"
                                 % (type(not_encoded_value), emsg))
            continue

        query.append((va_data['virtualauthorid'], va_data['tag'],
                      encoded_value))

    if query:
        try:
            run_sql_many(query_prelude, tuple(query))
        except (OperationalError, ProgrammingError), emsg:
            bconfig.LOGGER.critical("Inserting into virtual author "
                                    "data table failed: %s" % emsg)
            return (False, [])
        query = []

    vas_to_insert = [row for row in dat.VIRTUALAUTHORS
                     if row['virtualauthorid'] in insert_va_ids]
    query_prelude = ("INSERT INTO aidVIRTUALAUTHORS "
                     "(virtualauthorID, authornamesID, p, clusterID) "
                     "VALUES (%s, %s, %s, %s)")

    for va_entry in vas_to_insert:
        query.append((va_entry['virtualauthorid'], va_entry['authornamesid'],
                      va_entry['p'], va_entry['clusterid']))

    if query:
        try:
            run_sql_many(query_prelude, tuple(query))
        except (OperationalError, ProgrammingError), emsg:
            bconfig.LOGGER.critical("Inserting into virtual author "
                                    "table failed: %s" % emsg)
            return (False, [])
        query = []

    if sanity_checks:
        db_existing_ra_ids = run_sql("SELECT COUNT(DISTINCT realauthorid) "
                                     "FROM aidREALAUTHORS "
                                     "WHERE realauthorid in %s",
                                     (tuple(insert_ra_ids),))

        try:
            db_existing_ra_ids = int(db_existing_ra_ids[0][0])
        except (ValueError, IndexError, TypeError):
            bconfig.LOGGER.exception("Error while reading number of "
                                     "real authors in database")
            db_existing_ra_ids = -1

        if not (db_existing_ra_ids == 0):
            bconfig.LOGGER.error("Sanity checks reported that the "
                                 "real authors in the memory storage "
                                 "that shall be inserted already exist "
                                 "in the database. Aborting update mission.")
            return (False, [])

    ra_data_to_insert = [row for row in dat.REALAUTHOR_DATA
                         if row['realauthorid'] in insert_ra_ids]
    query_prelude = ("INSERT INTO aidREALAUTHORDATA "
                     "(realauthorID, tag, value, va_count, "
                     "va_names_p, va_p) VALUES "
                     "(%s, %s, %s, %s, %s, %s)")

    for ra_data in ra_data_to_insert:
        if not ra_data['tag'] == 'outgoing_citation':
            encoded_value = None
            not_encoded_value = ra_data['value']

            try:
                if isinstance(not_encoded_value, unicode):
                    encoded_value = not_encoded_value[0:254].encode('utf-8')
                elif isinstance(not_encoded_value, str):
                    encoded_value = not_encoded_value[0:254]
                else:
                    encoded_value = str(not_encoded_value)[0:254]
            except (UnicodeEncodeError, UnicodeDecodeError), emsg:
                bconfig.LOGGER.error("RA Data encoding error (%s): %s"
                                     % (type(not_encoded_value), emsg))
                continue

            query.append((ra_data['realauthorid'], ra_data['tag'],
                          encoded_value, ra_data['va_count'],
                          ra_data['va_np'], ra_data['va_p']))

    if query:
        try:
            run_sql_many(query_prelude, tuple(query))
        except (OperationalError, ProgrammingError), emsg:
            bconfig.LOGGER.critical("Inserting into real author "
                                    "data table failed: %s" % emsg)
            return (False, [])
        query = []

    query_prelude = ("INSERT INTO aidREALAUTHORS "
                     "(realauthorID, virtualauthorID, p) VALUES (%s, %s, %s)")
    ras_to_insert = [row for row in dat.REALAUTHORS
                     if row['realauthorid'] in insert_ra_ids]

    for ra_entry in ras_to_insert:
        query.append((ra_entry['realauthorid'], ra_entry['virtualauthorid'],
                      ra_entry['p']))

    if query:
        try:
            run_sql_many(query_prelude, tuple(query))
        except (OperationalError, ProgrammingError), emsg:
            bconfig.LOGGER.critical("Inserting into real author "
                                    "table failed: %s" % emsg)
            return (False, [])
        query = []

    if sanity_checks:
        db_existing_ra_ids = run_sql("SELECT COUNT(DISTINCT realauthorid) "
                                     "FROM aidREALAUTHORS "
                                     "WHERE realauthorid in %s",
                                     (tuple(insert_ra_ids),))

        try:
            db_existing_ra_ids = int(db_existing_ra_ids[0][0])
        except (ValueError, IndexError, TypeError):
            bconfig.LOGGER.exception("Error while reading number of "
                                     "real authors in database")
            db_existing_ra_ids = -1

        if not (db_existing_ra_ids == len(insert_ra_ids)):
            bconfig.LOGGER.error("Sanity checks reported that the number of "
                                 "real authors in the memory storage "
                                 "that shall be inserted is not equal to "
                                 "the number of real authors now "
                                 "in the database. Aborting update mission.")
            return (False, [])

    recid_updates = dat.UPDATES_LOG["rec_updates"]

    if recid_updates:
        recid_updates.add(-1)
        run_sql("DELETE FROM aidDOCLIST WHERE bibrecid in %s",
                (tuple(recid_updates),))
        doclist_insert = [row for row in dat.DOC_LIST
                          if row['bibrecid'] in dat.UPDATES_LOG["rec_updates"]]
        query_prelude = ("INSERT INTO aidDOCLIST "
                         "(bibrecID, processed_author) VALUES (%s, %s)")

        for doc in doclist_insert:
            for processed_author in doc['authornameids']:
                query.append((doc['bibrecid'], processed_author))

        if query:
            try:
                run_sql_many(query_prelude, tuple(query))
            except (OperationalError, ProgrammingError), emsg:
                bconfig.LOGGER.critical("Inserting into doc list "
                                        "table failed: %s" % emsg)
                return (False, [])
            query = []

    bconfig.LOGGER.log(25, "Saving update to persistence layer finished "
                       "with success!")

    if return_ra_updates:
        ra_ids = [[row['realauthorid']] for row in ras_to_insert]
        return (True, ra_ids)
    else:
        return (True, [])


def empty_aid_tables():
    '''
    Will empty all tables needed for a re-run of the algorithm.
    Exceptions are aidAUTHORNAMES*, which have to be updated a priori,
    and aidPERSONID, which has to be updated by the algorithm after the re-run.
    '''
    # truncate the per-run tables one statement at a time
    run_sql("TRUNCATE `aidDOCLIST`")
    run_sql("TRUNCATE `aidREALAUTHORDATA`")
    run_sql("TRUNCATE `aidREALAUTHORS`")
    run_sql("TRUNCATE `aidVIRTUALAUTHORS`")
    run_sql("TRUNCATE `aidVIRTUALAUTHORSCLUSTERS`")
    run_sql("TRUNCATE `aidVIRTUALAUTHORSDATA`")
+
+def update_authornames_name_from_dbname():
+    '''Rebuild aidAUTHORNAMES.name from the stored db_name, using unidecode when available.'''
+    try:
+        import unidecode
+        UNIDECODE_ENABLED = True
+    except ImportError:
+        print("Authorid will run without unidecode support! "
+              "This is not recommended! Please install unidecode!")
+        UNIDECODE_ENABLED = False
+
+
+    authornames = run_sql("select * from aidAUTHORNAMES")
+
+    for row in authornames:
+        insert_name = create_normalized_name(split_name_parts(row[3]))
+        if UNIDECODE_ENABLED:
+            aid_name = unidecode.unidecode(insert_name)
+            aid_name = aid_name.replace("\"", "")
+        else:
+            aid_name = insert_name
+            aid_name = aid_name.replace(u"\u201c", "")
+            aid_name = aid_name.replace(u"\u201d", "")
+
+        print row[0], row[3], "->", aid_name, "(instead of", row[1], ")"
+        try:
+            run_sql("update aidAUTHORNAMES set name=%s where id=%s",
+                    (aid_name.encode('utf-8'), row[0]))
+        except Exception, e:
+            print "ERROR: Could not store", row[0], row[3], "->", aid_name, "(instead of", row[1], ")"
+            print "ERROR message:", e
diff --git a/modules/bibauthorid/lib/bibauthorid_tests.py b/modules/bibauthorid/lib/bibauthorid_tests.py
index 7a0cf7491..d3eefd62c 100644
--- a/modules/bibauthorid/lib/bibauthorid_tests.py
+++ b/modules/bibauthorid/lib/bibauthorid_tests.py
@@ -1,136 +1,136 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

import unittest
import bibauthorid_utils as baidu
import bibauthorid_authorname_utils as bau

from invenio.testutils import make_test_suite, run_test_suite


class TestSplitNameParts(unittest.TestCase):
    """Test for the functionality of splitting name strings in parts"""

    def test_split_name_parts(self):
        """bibauthorid - test split name parts"""
        self.assertEqual(['This', ['I', 'F'], ['Isacorrect', 'Fullname'], [0, 1]],
                         baidu.split_name_parts('This, Isacorrect Fullname'))
        self.assertEqual(['', [], []],
                         baidu.split_name_parts(''))
        self.assertEqual(['name', ['F', 'I'], ['Full', 'Inverted'], [0, 1]],
                         baidu.split_name_parts('full inverted name'))
        self.assertEqual(['Two Words', ['S', 'N'], ['Surname', 'Name'], [0, 1]],
                         baidu.split_name_parts('Two Words, Surname Name'))
        self.assertEqual(['Strange+)*{ (=]&-$Char', ['N'], ['Name'], [0]],
                         baidu.split_name_parts('Strange+)*{ (=]&-$Char, Name'))


class TestCreateUnifiedNames(unittest.TestCase):
    """Test for the functionality of creation of unified names strings"""

    def test_create_unified_name(self):
        """bibauthorid - test creation of unified name strings"""
        self.assertEqual('this, I. F. ',
                         baidu.create_unified_name('this, isa fullname'))
        self.assertEqual('fullname, T. I. ',
                         baidu.create_unified_name('this isa fullname'))
        self.assertEqual(', ',
                         baidu.create_unified_name(''))
        self.assertEqual('Strange$![+{&]+)= Chars, T. ',
                         baidu.create_unified_name('Strange$![+{&]+)= Chars, Twonames'))


class TestCreateNormalizedName(unittest.TestCase):
    """Test for the functionality of creation of normalized names strings"""

    def test_create_normalized_name(self):
        """bibauthorid - test creation of normalized name strings"""
        self.assertEqual('this, Isa Fullname',
                         baidu.create_normalized_name(
                             baidu.split_name_parts('this, isa fullname')))
        self.assertEqual('fullname, This Isa',
                         baidu.create_normalized_name(
                             baidu.split_name_parts('this isa fullname')))
        self.assertEqual('Strange&][{}) ==}{$*]!, Name',
                         baidu.create_normalized_name(
                             baidu.split_name_parts('Strange&][{}) ==}{$*]!, Name')))
        self.assertEqual(',',
                         baidu.create_normalized_name(
                             baidu.split_name_parts('')))


class TestCleanNameString(unittest.TestCase):
    """Test for the functionality of creation of cleaned names strings"""

    def test_clean_name_string(self):
        """bibauthorid - test cleaning of name strings"""
        self.assertEqual('this is a full name',
                         baidu.clean_name_string('this is a full name'))
        self.assertEqual('this is a full ,. pz',
                         baidu.clean_name_string('this is a full ;,.$&[{{}}(=*)+]pz'))
        self.assertEqual('',
                         baidu.clean_name_string(''))


class TestCompareNames(unittest.TestCase):
    """Test for the functionality of comparison of names strings"""

    def test_compare_names(self):
        """bibauthorid - test names comparison functions"""
-        self.assertEqual(0.94999999999999996,
+        self.assertEqual(1.0,
                         bau.compare_names('Ellis, j.', 'Ellis, j.'))
        self.assertEqual(1.0,
                         bau.compare_names('Ellis, john', 'Ellis, john'))
        self.assertEqual(1.0,
                         bau.compare_names('John Ellis', 'John Ellis'))
#        self.assertEqual(0.94999999999999996,
#                         bau.compare_names('J. Ellis','J. Ellis'))
        self.assertEqual(0.0,
                         bau.compare_names('John Ellis', 'John Mark'))
        self.assertEqual(0.0,
                         bau.compare_names('Ellis, John', 'Mark, John'))
-        self.assertEqual(0.0,
+        self.assertEqual(1.0,
                         bau.compare_names('', ''))


TEST_SUITE = make_test_suite(TestSplitNameParts,
                             TestCreateUnifiedNames,
                             TestCreateNormalizedName,
                             TestCleanNameString,
                             TestCompareNames,)

if __name__ == "__main__":
    run_test_suite(TEST_SUITE)
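
For reviewers who want to double-check the revised expectations in TestCompareNames by hand, here is a minimal sketch. It is an illustration only, not part of the patch, and it assumes an installed Invenio with the bibauthorid library directory on the Python path, as in the imports of the test file above:

    # Quick manual check of the updated compare_names() expectations.
    # Run from a Python 2 shell in the Invenio environment.
    import bibauthorid_authorname_utils as bau

    print bau.compare_names('Ellis, j.', 'Ellis, j.')   # expected: 1.0
    print bau.compare_names('', '')                     # expected: 1.0
    print bau.compare_names('John Ellis', 'John Mark')  # expected: 0.0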