diff --git a/Makefile.am b/Makefile.am index 0304cfb26..a643fa555 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,497 +1,493 @@ # This file is part of Invenio. # Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. confignicedir = $(sysconfdir)/build confignice_SCRIPTS=config.nice SUBDIRS = po config modules scripts EXTRA_DIST = UNINSTALL THANKS RELEASE-NOTES configure-tests.py config.nice.in \ config.rpath CONTRIBUTING.rst Dockerfile docker-compose.yml \ requirements.txt Vagrantfile .inveniorc # current MathJax version and packages # See also modules/miscutil/lib/htmlutils.py (get_mathjax_header) MJV = 2.3 MATHJAX = http://invenio-software.org/download/mathjax/MathJax-v$(MJV).zip # current CKeditor version CKV = 3.6.6 CKEDITOR = ckeditor_$(CKV).zip -# current MediaElement.js version -MEV = master -MEDIAELEMENT = http://github.com/johndyer/mediaelement/zipball/$(MEV) - #for solrutils INVENIO_JAVA_PATH = org/invenio_software/solr solrdirname = apache-solr-3.1.0 solrdir = $(prefix)/lib/$(solrdirname) solrutils_dir=$(CURDIR)/modules/miscutil/lib/solrutils CLASSPATH=.:${solrdir}/dist/solrj-lib/commons-io-1.4.jar:${solrdir}/dist/apache-solr-core-*jar:${solrdir}/contrib/jzlib-1.0.7.jar:${solrdir}/dist/apache-solr-solrj-3.1.0.jar:${solrdir}/dist/solrj-lib/slf4j-api-1.5.5.jar:${solrdir}/dist/*:${solrdir}/contrib/basic-lucene-libs/*:${solrdir}/contrib/analysis-extras/lucene-libs/*:${solrdir}/dist/solrj-lib/* # git-version-get stuff: BUILT_SOURCES = $(top_srcdir)/.version $(top_srcdir)/.version: echo $(VERSION) > $@-t && mv $@-t $@ dist-hook: echo $(VERSION) > $(distdir)/.tarball-version check-upgrade: $(PYTHON) $(top_srcdir)/modules/miscutil/lib/inveniocfg_upgrader.py $(top_srcdir) --upgrade-check kwalitee-check: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --stats $(top_srcdir) kwalitee-check-errors-only: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --check-errors $(top_srcdir) kwalitee-check-variables: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --check-variables $(top_srcdir) kwalitee-check-indentation: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --check-indentation $(top_srcdir) kwalitee-check-sql-queries: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --check-sql $(top_srcdir) etags: \rm -f $(top_srcdir)/TAGS (cd $(top_srcdir) && find $(top_srcdir) -name "*.py" -print | xargs etags) install-data-local: for d in / /cache /cache/RTdata /log /tmp /tmp-shared /data /run /tmp-shared/bibencode/jobs/done /tmp-shared/bibedit-cache; do \ mkdir -p $(localstatedir)$$d ; \ done @echo "************************************************************" @echo "** Invenio software has been successfully installed! **" @echo "** **" @echo "** You may proceed to customizing your installation now. 
**" @echo "************************************************************" install-mathjax-plugin: @echo "***********************************************************" @echo "** Installing MathJax plugin, please wait... **" @echo "***********************************************************" rm -rf /tmp/invenio-mathjax-plugin mkdir /tmp/invenio-mathjax-plugin rm -fr ${prefix}/var/www/MathJax mkdir -p ${prefix}/var/www/MathJax (cd /tmp/invenio-mathjax-plugin && \ wget '$(MATHJAX)' -O mathjax.zip && \ unzip -q mathjax.zip && cd mathjax-MathJax-* && cp -r * \ ${prefix}/var/www/MathJax) rm -fr /tmp/invenio-mathjax-plugin @echo "************************************************************" @echo "** The MathJax plugin was successfully installed. **" @echo "** Please do not forget to properly set the option **" @echo "** CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS and **" @echo "** CFG_WEBSUBMIT_USE_MATHJAX in invenio.conf. **" @echo "************************************************************" uninstall-mathjax-plugin: @rm -rvf ${prefix}/var/www/MathJax @echo "***********************************************************" @echo "** The MathJax plugin was successfully uninstalled. **" @echo "***********************************************************" install-jscalendar-plugin: @echo "***********************************************************" @echo "** Installing jsCalendar plugin, please wait... **" @echo "***********************************************************" rm -rf /tmp/invenio-jscalendar-plugin mkdir /tmp/invenio-jscalendar-plugin (cd /tmp/invenio-jscalendar-plugin && \ wget 'http://www.dynarch.com/static/jscalendar-1.0.zip' && \ unzip -u jscalendar-1.0.zip && \ mkdir -p ${prefix}/var/www/jsCalendar && \ cp jscalendar-1.0/img.gif ${prefix}/var/www/jsCalendar/jsCalendar.gif && \ cp jscalendar-1.0/calendar.js ${prefix}/var/www/jsCalendar/ && \ cp jscalendar-1.0/calendar-setup.js ${prefix}/var/www/jsCalendar/ && \ cp jscalendar-1.0/lang/calendar-en.js ${prefix}/var/www/jsCalendar/ && \ cp jscalendar-1.0/calendar-blue.css ${prefix}/var/www/jsCalendar/) rm -fr /tmp/invenio-jscalendar-plugin @echo "***********************************************************" @echo "** The jsCalendar plugin was successfully installed. **" @echo "***********************************************************" uninstall-jscalendar-plugin: @rm -rvf ${prefix}/var/www/jsCalendar @echo "***********************************************************" @echo "** The jsCalendar plugin was successfully uninstalled. **" @echo "***********************************************************" install-js-test-driver: @echo "*******************************************************" @echo "** Installing js-test-driver, please wait... **" @echo "*******************************************************" mkdir -p $(prefix)/lib/java/js-test-driver && \ cd $(prefix)/lib/java/js-test-driver && \ wget http://invenio-software.org/download/js-test-driver/JsTestDriver-1.3.5.jar -O JsTestDriver.jar uninstall-js-test-driver: @rm -rvf ${prefix}/lib/java/js-test-driver @echo "*********************************************************" @echo "** The js-test-driver was successfully uninstalled. **" @echo "*********************************************************" install-jquery-plugins: @echo "***********************************************************" @echo "** Installing various jQuery plugins, please wait... 
**" @echo "***********************************************************" mkdir -p ${prefix}/var/www/js mkdir -p $(prefix)/var/www/css (cd ${prefix}/var/www/js && \ wget -O jquery.min.js http://invenio-software.org/download/jquery/jquery-1.7.1.min.js && \ wget -N http://ajax.googleapis.com/ajax/libs/jqueryui/1.8.17/jquery-ui.min.js && \ wget -O jquery.jeditable.mini.js http://invenio-software.org/download/jquery/jquery.jeditable.custom.min.js && \ wget -N https://raw.githubusercontent.com/malsup/form/3.51/jquery.form.js --no-check-certificate && \ - wget -N http://jquery-multifile-plugin.googlecode.com/svn-history/r54/trunk/jquery.MultiFile.pack.js && \ + wget -N http://invenio-software.org/download/jquery/jquery.MultiFile.pack.js && \ wget -O jquery.tablesorter.zip http://invenio-software.org/download/jquery/jquery.tablesorter.20111208.zip && \ wget -O uploadify.zip http://invenio-software.org/download/jquery/uploadify-v2.1.4.zip && \ wget -N http://cdn.datatables.net/1.10.4/js/jquery.dataTables.min.js && \ wget -N http://invenio-software.org/download/jquery/jquery.bookmark.package-1.4.0.zip && \ unzip -u jquery.tablesorter.zip -d tablesorter && \ wget -N http://invenio-software.org/download/jquery/jquery.fileTree-1.01.zip && \ unzip -u jquery.fileTree-1.01.zip && \ rm jquery.fileTree-1.01.zip && \ wget -N http://invenio-software.org/download/jquery/ColVis.min.js && \ mv ColVis.min.js jquery.dataTables.ColVis.min.js && \ rm jquery.tablesorter.zip && \ rm -rf uploadify && \ unzip -u uploadify.zip -d uploadify && \ wget -N http://invenio-software.org/download/jquery/flot-0.6.zip && \ wget -N http://www.csspace.net/tmp/jquery-lightbox-0.5.zip && \ rm -rf jquery-lightbox && \ unzip -u jquery-lightbox-0.5.zip -d jquery-lightbox && \ sed -i 's/images\//\/js\/jquery-lightbox\/images\//g' jquery-lightbox/js/jquery.lightbox-0.5.js && \ rm -rf jquery-lightbox-0.5.zip && \ wget -O jquery-ui-timepicker-addon.js http://invenio-software.org/download/jquery/jquery-ui-timepicker-addon-1.0.3.js && \ unzip -u flot-0.6.zip && \ mv flot/jquery.flot.selection.min.js flot/jquery.flot.min.js flot/excanvas.min.js ./ && \ rm flot-0.6.zip && rm -r flot && \ mv uploadify/swfobject.js ./ && \ mv uploadify/cancel.png uploadify/uploadify.css uploadify/uploadify.allglyphs.swf uploadify/uploadify.fla uploadify/uploadify.swf ../img/ && \ mv uploadify/jquery.uploadify.v2.1.4.min.js ./jquery.uploadify.min.js && \ rm uploadify.zip && rm -r uploadify && \ - wget -N https://github.com/douglascrockford/JSON-js/raw/master/json2.js --no-check-certificate && \ + wget -N http://invenio-software.org/download/jquery/json2.js --no-check-certificate && \ wget -O jquery.hotkeys.js http://invenio-software.org/download/jquery/jquery.hotkeys-0.8.js && \ wget -N http://invenio-software.org/download/jquery/jquery.treeview.zip && \ unzip -u jquery.treeview.zip -d jquery-treeview && \ rm jquery.treeview.zip && \ wget -N http://invenio-software.org/download/jquery/v1.5/js/jquery.ajaxPager.js && \ unzip -u jquery.bookmark.package-1.4.0.zip && \ rm -f jquery.bookmark.ext.* bookmarks-big.png bookmarkBasic.html jquery.bookmark.js jquery.bookmark.pack.js && \ mv bookmarks.png ../img/ && \ mv jquery.bookmark.css ../css/ && \ wget -N --no-check-certificate http://invenio-software.org/download/jquery/jquery.omniwindow.js && \ wget -N --no-check-certificate http://invenio-software.org/download/jquery/jquery.blockUI.js && \ wget -N --no-check-certificate http://invenio-software.org/download/jquery/sly.min.js &&\ wget -N --no-check-certificate 
http://invenio-software.org/download/jquery/parsley.js &&\ wget -N --no-check-certificate http://invenio-software.org/download/jquery/spin.min.js &&\ rm -f jquery.bookmark.package-1.4.0.zip && \ wget https://cdnjs.cloudflare.com/ajax/libs/handlebars.js/1.3.0/handlebars.min.js && \ wget https://twitter.github.com/typeahead.js/releases/0.10.5/typeahead.bundle.min.js && \ wget https://raw.githubusercontent.com/es-shims/es5-shim/v4.0.3/es5-shim.min.js && \ wget https://raw.githubusercontent.com/es-shims/es5-shim/v4.0.3/es5-shim.map && \ mkdir -p ${prefix}/var/www/img && \ cd ${prefix}/var/www/img && \ - wget -r -np -nH --cut-dirs=4 -A "png,css" -P jquery-ui/themes http://jquery-ui.googlecode.com/svn/tags/1.8.17/themes/base/ && \ - wget -r -np -nH --cut-dirs=4 -A "png,css" -P jquery-ui/themes http://jquery-ui.googlecode.com/svn/tags/1.8.17/themes/smoothness/ && \ - wget -r -np -nH --cut-dirs=4 -A "png,css" -P jquery-ui/themes http://jquery-ui.googlecode.com/svn/tags/1.8.17/themes/redmond/ && \ - wget --no-check-certificate -O datatables_jquery-ui.css https://raw.githubusercontent.com/DataTables/DataTables/1.10.0/media/css/demo_table_jui.css && \ - wget -N http://jquery-ui.googlecode.com/svn/tags/1.8.17/themes/redmond/jquery-ui.css && \ - wget -N http://jquery-ui.googlecode.com/svn/tags/1.8.17/demos/images/calendar.gif && \ - wget -r -np -nH --cut-dirs=5 -A "png" http://jquery-ui.googlecode.com/svn/tags/1.8.17/themes/redmond/images/) + wget http://invenio-software.org/download/jquery/jquery-ui.tar.gz && \ + tar xvfz jquery-ui.tar.gz && \ + rm jquery-ui.tar.gz && \ + wget -O datatables_jquery-ui.css https://raw.githubusercontent.com/DataTables/DataTables/1.10.0/media/css/demo_table_jui.css --no-check-certificate && \ + cp jquery-ui/themes/redmond/jquery-ui.css . && \ + wget -N http://invenio-software.org/download/jquery/v1.5/img/calendar.gif && \ + cp jquery-ui/themes/redmond/images/*.png .) @echo "***********************************************************" @echo "** The jQuery plugins were successfully installed. 
**" @echo "***********************************************************" uninstall-jquery-plugins: (cd ${prefix}/var/www/js && \ rm -f jquery.min.js && \ rm -f jquery.MultiFile.pack.js && \ rm -f jquery.jeditable.mini.js && \ rm -f jquery.flot.selection.min.js && \ rm -f jquery.flot.min.js && \ rm -f excanvas.min.js && \ rm -f jquery-ui-timepicker-addon.min.js && \ rm -f json2.js && \ rm -f jquery.uploadify.min.js && \ rm -rf tablesorter && \ rm -rf jquery-treeview && \ rm -f jquery.ajaxPager.js && \ rm -f jquery.form.js && \ rm -f jquery.dataTables.min.js && \ rm -f ui.core.js && \ rm -f jquery.bookmark.min.js && \ rm -f handlebars.min.js && \ rm -f typeahead.bundle.min.js && \ rm -f es5-shim.min.js && \ rm -f es5-shim.map && \ rm -f jquery.dataTables.ColVis.min.js && \ rm -f jquery.hotkeys.js && \ rm -f jquery.tablesorter.min.js && \ rm -f jquery-ui-1.7.3.custom.min.js && \ rm -f jquery.metadata.js && \ rm -f jquery-latest.js && \ rm -f jquery-ui.min.js && \ rm -rf jquery-lightbox && \ rm -f jquery-ui-timepicker-addon.js && \ rm -rf jqueryFileTree && \ rm -f swfobject.js && \ rm -f jquery.blockUI.js && \ rm -f sly.min.js && \ rm -f parsley.js && \ rm -f spin.min.js && \ rm -f jquery.omniwindow.js) (cd ${prefix}/var/www/img && \ rm -f cancel.png uploadify.css uploadify.swf uploadify.allglyphs.swf uploadify.fla && \ rm -f datatables_jquery-ui.css \ rm -f bookmarks.png \ rm -f demo_table_jui.css \ rm -f calendar.gif \ rm -rf jquery-ui/themes) && \ (cd ${prefix}/var/www/css && \ rm -f jquery.bookmark.css) @echo "***********************************************************" @echo "** The jquery plugins were successfully uninstalled. **" @echo "***********************************************************" install-ckeditor-plugin: @echo "***********************************************************" @echo "** Installing CKeditor plugin, please wait... **" @echo "***********************************************************" rm -rf ${prefix}/lib/python/invenio/ckeditor/ rm -rf /tmp/invenio-ckeditor-plugin mkdir /tmp/invenio-ckeditor-plugin (cd /tmp/invenio-ckeditor-plugin && \ wget 'http://invenio-software.org/download/ckeditor/$(CKEDITOR)' && \ unzip -u -d ${prefix}/var/www $(CKEDITOR)) && \ find ${prefix}/var/www/ckeditor/ -depth -name '_*' -exec rm -rf {} \; && \ find ${prefix}/var/www/ckeditor/ckeditor* -maxdepth 0 ! -name "ckeditor.js" -exec rm -r {} \; && \ rm -fr /tmp/invenio-ckeditor-plugin @echo "* Installing Invenio-specific CKeditor config..." (cd $(top_srcdir)/modules/webstyle/etc && make install) @echo "***********************************************************" @echo "** The CKeditor plugin was successfully installed. **" @echo "** Please do not forget to properly set the option **" @echo "** CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR in invenio.conf. **" @echo "***********************************************************" uninstall-ckeditor-plugin: @rm -rvf ${prefix}/var/www/ckeditor @rm -rvf ${prefix}/lib/python/invenio/ckeditor @echo "***********************************************************" @echo "** The CKeditor plugin was successfully uninstalled. **" @echo "***********************************************************" install-pdfa-helper-files: @echo "***********************************************************" @echo "** Installing PDF/A helper files, please wait... 
**" @echo "***********************************************************" wget 'http://invenio-software.org/download/invenio-demo-site-files/ISOCoatedsb.icc' -O ${prefix}/etc/websubmit/file_converter_templates/ISOCoatedsb.icc @echo "***********************************************************" @echo "** The PDF/A helper files were successfully installed. **" @echo "***********************************************************" install-mediaelement: @echo "***********************************************************" @echo "** MediaElement.js, please wait... **" @echo "***********************************************************" rm -rf /tmp/mediaelement mkdir /tmp/mediaelement - wget 'http://github.com/johndyer/mediaelement/zipball/master' -O '/tmp/mediaelement/mediaelement.zip' --no-check-certificate + wget 'http://github.com/johndyer/mediaelement/zipball/2.18.1' -O '/tmp/mediaelement/mediaelement.zip' --no-check-certificate unzip -u -d '/tmp/mediaelement' '/tmp/mediaelement/mediaelement.zip' rm -rf ${prefix}/var/www/mediaelement mkdir ${prefix}/var/www/mediaelement mv /tmp/mediaelement/johndyer-mediaelement-*/build/* ${prefix}/var/www/mediaelement rm -rf /tmp/mediaelement @echo "***********************************************************" @echo "** MediaElement.js was successfully installed. **" @echo "***********************************************************" uninstall-pdfa-helper-files: rm -f ${prefix}/etc/websubmit/file_converter_templates/ISOCoatedsb.icc @echo "***********************************************************" @echo "** The PDF/A helper files were successfully uninstalled. **" @echo "***********************************************************" #Solrutils allows automatic installation, running and searching of an external Solr index. install-solrutils: @echo "***********************************************************" @echo "** Installing Solrutils and solr, please wait... **" @echo "***********************************************************" cd $(prefix)/lib && \ if test -d apache-solr*; then echo A solr directory already exists in `pwd` . \ Please remove it manually, if you are sure it is not needed; exit 2; fi ; \ if test -f apache-solr*; then echo solr tarball already exists in `pwd` . 
\ Please remove it manually.; exit 2; fi ; \ wget http://archive.apache.org/dist/lucene/solr/3.1.0/apache-solr-3.1.0.tgz && \ tar -xzf apache-solr-3.1.0.tgz && \ rm apache-solr-3.1.0.tgz cd $(solrdir)/contrib/ ;\ wget http://mirrors.ibiblio.org/pub/mirrors/maven2/com/jcraft/jzlib/1.0.7/jzlib-1.0.7.jar && \ cd $(solrdir)/contrib/ ;\ jar -xf ../example/webapps/solr.war WEB-INF/lib/lucene-core-3.1.0.jar ; \ if test -d basic-lucene-libs; then rm -rf basic-lucene-libs; fi ; \ mv WEB-INF/lib/ basic-lucene-libs ; \ cp $(solrutils_dir)/schema.xml $(solrdir)/example/solr/conf/ cp $(solrutils_dir)/solrconfig.xml $(solrdir)/example/solr/conf/ cd $(solrutils_dir) && \ javac -classpath $(CLASSPATH) -d $(solrdir)/contrib @$(solrutils_dir)/java_sources.txt && \ cd $(solrdir)/contrib/ && \ jar -cf invenio-solr.jar org/invenio_software/solr/*class update-v0.99.0-tables: cat $(top_srcdir)/modules/miscutil/sql/tabcreate.sql | grep -v 'INSERT INTO upgrade' | ${prefix}/bin/dbexec echo "DROP TABLE IF EXISTS oaiREPOSITORY;" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc ADD COLUMN more_info mediumblob NULL default NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK ADD COLUMN priority tinyint(4) NOT NULL default 0;" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK ADD KEY priority (priority);" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkCITATIONDATA DROP PRIMARY KEY;" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkCITATIONDATA ADD PRIMARY KEY (id);" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkCITATIONDATA CHANGE id id mediumint(8) unsigned NOT NULL auto_increment;" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkCITATIONDATA ADD UNIQUE KEY object_name (object_name);" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmPARAMETERS CHANGE value value text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmAPPROVAL ADD note text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE hstDOCUMENT CHANGE docsize docsize bigint(15) unsigned NOT NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtACTIONHISTORY CHANGE client_host client_host int(10) unsigned default NULL;" | ${prefix}/bin/dbexec update-v0.99.1-tables: @echo "Nothing to do; table structure did not change between v0.99.1 and v0.99.2." update-v0.99.2-tables: @echo "Nothing to do; table structure did not change between v0.99.2 and v0.99.3." update-v0.99.3-tables: @echo "Nothing to do; table structure did not change between v0.99.3 and v0.99.4." update-v0.99.4-tables: @echo "Nothing to do; table structure did not change between v0.99.4 and v0.99.5." update-v0.99.5-tables: @echo "Nothing to do; table structure did not change between v0.99.5 and v0.99.6." update-v0.99.6-tables: @echo "Nothing to do; table structure did not change between v0.99.6 and v0.99.7." update-v0.99.7-tables: @echo "Nothing to do; table structure did not change between v0.99.7 and v0.99.8." update-v0.99.8-tables: @echo "Nothing to do; table structure did not change between v0.99.8 and v0.99.9." 
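(Illustration, not part of the Makefile: the update-v0.99.* targets above and below apply schema migrations by piping raw SQL into ${prefix}/bin/dbexec, which executes each statement against the configured Invenio database. A minimal Python sketch of the same idea, assuming the invenio.dbquery.run_sql helper that this patch already imports in bibdocfile.py, would be:

    from invenio.dbquery import run_sql

    def apply_statements(statements):
        """Apply schema-migration statements one by one, in order."""
        for sql in statements:
            run_sql(sql)

    # two of the statements from the update-v0.99.0-tables target above
    apply_statements([
        "ALTER TABLE schTASK ADD COLUMN priority tinyint(4) NOT NULL default 0",
        "ALTER TABLE schTASK ADD KEY priority (priority)",
    ])

The Makefile targets rely on dbexec, which exposes the same statement-by-statement execution from the shell.)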
update-v0.99.9-tables: # from v0.99.9 to v1.0.0-rc0 echo "RENAME TABLE oaiARCHIVE TO oaiREPOSITORY;" | ${prefix}/bin/dbexec cat $(top_srcdir)/modules/miscutil/sql/tabcreate.sql | grep -v 'INSERT INTO upgrade' | ${prefix}/bin/dbexec echo "INSERT INTO knwKB (id,name,description,kbtype) SELECT id,name,description,'' FROM fmtKNOWLEDGEBASES;" | ${prefix}/bin/dbexec echo "INSERT INTO knwKBRVAL (id,m_key,m_value,id_knwKB) SELECT id,m_key,m_value,id_fmtKNOWLEDGEBASES FROM fmtKNOWLEDGEBASEMAPPINGS;" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmPARAMETERS CHANGE name name varchar(40) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc CHANGE docname docname varchar(250) COLLATE utf8_bin NOT NULL default 'file';" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc CHANGE status status text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc ADD COLUMN text_extraction_date datetime NOT NULL default '0000-00-00';" | ${prefix}/bin/dbexec echo "ALTER TABLE collection DROP COLUMN restricted;" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK CHANGE host host varchar(255) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE hstTASK CHANGE host host varchar(255) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE bib85x DROP INDEX kv, ADD INDEX kv (value(100));" | ${prefix}/bin/dbexec echo "UPDATE clsMETHOD SET location='http://invenio-software.org/download/invenio-demo-site-files/HEP.rdf' WHERE name='HEP' AND location='';" | ${prefix}/bin/dbexec echo "UPDATE clsMETHOD SET location='http://invenio-software.org/download/invenio-demo-site-files/NASA-subjects.rdf' WHERE name='NASA-subjects' AND location='';" | ${prefix}/bin/dbexec echo "UPDATE accACTION SET name='runoairepository', description='run oairepositoryupdater task' WHERE name='runoaiarchive';" | ${prefix}/bin/dbexec echo "UPDATE accACTION SET name='cfgoaiharvest', description='configure OAI Harvest' WHERE name='cfgbibharvest';" | ${prefix}/bin/dbexec echo "ALTER TABLE accARGUMENT CHANGE value value varchar(255);" | ${prefix}/bin/dbexec echo "UPDATE accACTION SET allowedkeywords='doctype,act,categ' WHERE name='submit';" | ${prefix}/bin/dbexec echo "INSERT INTO accARGUMENT(keyword,value) VALUES ('categ','*');" | ${prefix}/bin/dbexec echo "INSERT INTO accROLE_accACTION_accARGUMENT(id_accROLE,id_accACTION,id_accARGUMENT,argumentlistid) SELECT DISTINCT raa.id_accROLE,raa.id_accACTION,accARGUMENT.id,raa.argumentlistid FROM accROLE_accACTION_accARGUMENT as raa JOIN accACTION on id_accACTION=accACTION.id,accARGUMENT WHERE accACTION.name='submit' and accARGUMENT.keyword='categ' and accARGUMENT.value='*';" | ${prefix}/bin/dbexec echo "UPDATE accACTION SET allowedkeywords='name,with_editor_rights' WHERE name='cfgwebjournal';" | ${prefix}/bin/dbexec echo "INSERT INTO accARGUMENT(keyword,value) VALUES ('with_editor_rights','yes');" | ${prefix}/bin/dbexec echo "INSERT INTO accROLE_accACTION_accARGUMENT(id_accROLE,id_accACTION,id_accARGUMENT,argumentlistid) SELECT DISTINCT raa.id_accROLE,raa.id_accACTION,accARGUMENT.id,raa.argumentlistid FROM accROLE_accACTION_accARGUMENT as raa JOIN accACTION on id_accACTION=accACTION.id,accARGUMENT WHERE accACTION.name='cfgwebjournal' and accARGUMENT.keyword='with_editor_rights' and accARGUMENT.value='yes';" | ${prefix}/bin/dbexec echo "ALTER TABLE bskEXTREC CHANGE id id int(15) unsigned NOT NULL auto_increment;" | ${prefix}/bin/dbexec echo "ALTER TABLE bskEXTREC ADD external_id int(15) NOT NULL default '0';" | ${prefix}/bin/dbexec echo "ALTER TABLE bskEXTREC ADD collection_id 
int(15) unsigned NOT NULL default '0';" | ${prefix}/bin/dbexec echo "ALTER TABLE bskEXTREC ADD original_url text;" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD status char(2) NOT NULL default 'ok';" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD KEY status (status);" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Photos_to_Storage','Attach/edit the pictures uploaded with the \"create_photos_manager_interface()\" function');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFIELDDESC VALUES ('Upload_Photos',NULL,'','R',NULL,NULL,NULL,NULL,NULL,'\"\"\"\r\nThis is an example of element that creates a photos upload interface.\r\nClone it, customize it and integrate it into your submission. Then add function \r\n\'Move_Photos_to_Storage\' to your submission functions list, in order for files \r\nuploaded with this interface to be attached to the record. More information in \r\nthe WebSubmit admin guide.\r\n\"\"\"\r\n\r\nfrom invenio.websubmit_functions.ParamFile import ParamFromFile\r\nfrom invenio.websubmit_functions.Move_Photos_to_Storage import read_param_file, create_photos_manager_interface, get_session_id\r\n\r\n# Retrieve session id\r\ntry:\r\n # User info is defined only in MBI/MPI actions...\r\n session_id = get_session_id(None, uid, user_info) \r\nexcept:\r\n session_id = get_session_id(req, uid, {})\r\n\r\n# Retrieve context\r\nindir = curdir.split(\'/\')[-3]\r\ndoctype = curdir.split(\'/\')[-2]\r\naccess = curdir.split(\'/\')[-1]\r\n\r\n# Get the record ID, if any\r\nsysno = ParamFromFile(\"%s/%s\" % (curdir,\'SN\')).strip()\r\n\r\n\"\"\"\r\nModify below the configuration of the photos manager interface.\r\nNote: \'can_reorder_photos\' parameter is not yet fully taken into consideration\r\n\r\nDocumentation of the function is available by running:\r\necho -e \'from invenio.websubmit_functions.Move_Photos_to_Storage import create_photos_manager_interface as f\\nprint f.__doc__\' | python\r\n\"\"\"\r\ntext += create_photos_manager_interface(sysno, session_id, uid,\r\n doctype, indir, curdir, access,\r\n can_delete_photos=True,\r\n can_reorder_photos=True,\r\n can_upload_photos=True,\r\n editor_width=700,\r\n editor_height=400,\r\n initial_slider_value=100,\r\n max_slider_value=200,\r\n min_slider_value=80)','0000-00-00','0000-00-00',NULL,NULL,0);" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Photos_to_Storage','iconsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFIELDDESC VALUES ('Upload_Files',NULL,'','R',NULL,NULL,NULL,NULL,NULL,'\"\"\"\r\nThis is an example of element that creates a file upload interface.\r\nClone it, customize it and integrate it into your submission. Then add function \r\n\'Move_Uploaded_Files_to_Storage\' to your submission functions list, in order for files \r\nuploaded with this interface to be attached to the record. 
More information in \r\nthe WebSubmit admin guide.\r\n\"\"\"\r\nfrom invenio.websubmit_managedocfiles import create_file_upload_interface\r\nfrom invenio.websubmit_functions.Shared_Functions import ParamFromFile\r\n\r\nindir = ParamFromFile(os.path.join(curdir, \'indir\'))\r\ndoctype = ParamFromFile(os.path.join(curdir, \'doctype\'))\r\naccess = ParamFromFile(os.path.join(curdir, \'access\'))\r\ntry:\r\n sysno = int(ParamFromFile(os.path.join(curdir, \'SN\')).strip())\r\nexcept:\r\n sysno = -1\r\nln = ParamFromFile(os.path.join(curdir, \'ln\'))\r\n\r\n\"\"\"\r\nRun the following to get the list of parameters of function \'create_file_upload_interface\':\r\necho -e \'from invenio.websubmit_managedocfiles import create_file_upload_interface as f\\nprint f.__doc__\' | python\r\n\"\"\"\r\ntext = create_file_upload_interface(recid=sysno,\r\n print_outside_form_tag=False,\r\n include_headers=True,\r\n ln=ln,\r\n doctypes_and_desc=[(\'main\',\'Main document\'),\r\n (\'additional\',\'Figure, schema, etc.\')],\r\n can_revise_doctypes=[\'*\'],\r\n can_describe_doctypes=[\'main\'],\r\n can_delete_doctypes=[\'additional\'],\r\n can_rename_doctypes=[\'main\'],\r\n sbm_indir=indir, sbm_doctype=doctype, sbm_access=access)[1]\r\n','0000-00-00','0000-00-00',NULL,NULL,0);" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Uploaded_Files_to_Storage','forceFileRevision');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Create_Upload_Files_Interface','Display generic interface to add/revise/delete files. To be used before function \"Move_Uploaded_Files_to_Storage\"');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Uploaded_Files_to_Storage','Attach files uploaded with \"Create_Upload_Files_Interface\"')" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','elementNameToDoctype');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','createIconDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','createRelatedFormats');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','iconsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','keepPreviousVersionDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Revised_Files_to_Storage','Revise files initially uploaded with \"Move_Files_to_Storage\"')" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','maxsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','minsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','doctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','restrictions');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canDeleteDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canReviseDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canDescribeDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canCommentDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canKeepDoctypes');" | ${prefix}/bin/dbexec echo 
"INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canAddFormatDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canRestrictDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canRenameDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canNameNewFiles');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','createRelatedFormats');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','keepDefault');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','showLinks');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','fileLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','filenameLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','descriptionLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','commentLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','restrictionLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','startDoc');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','endDoc');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','defaultFilenameDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','maxFilesDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Uploaded_Files_to_Storage','iconsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Uploaded_Files_to_Storage','createIconDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Report_Number_Generation','nblength');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Second_Report_Number_Generation','2nd_nb_length');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Get_Recid','record_search_pattern');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Move_FCKeditor_Files_to_Storage','Transfer files attached to the record with the FCKeditor');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_FCKeditor_Files_to_Storage','input_fields');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Stamp_Uploaded_Files','layer');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','layer');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','switch_file');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Stamp_Uploaded_Files','switch_file');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Files_to_Storage','paths_and_restrictions');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Files_to_Storage','paths_and_doctypes');" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD round_name varchar(255) NOT NULL default ''" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD restriction varchar(50) NOT NULL default ''" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD in_reply_to_id_cmtRECORDCOMMENT int(15) unsigned NOT NULL 
default '0'" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD KEY in_reply_to_id_cmtRECORDCOMMENT (in_reply_to_id_cmtRECORDCOMMENT);" | ${prefix}/bin/dbexec echo "ALTER TABLE bskRECORDCOMMENT ADD in_reply_to_id_bskRECORDCOMMENT int(15) unsigned NOT NULL default '0'" | ${prefix}/bin/dbexec echo "ALTER TABLE bskRECORDCOMMENT ADD KEY in_reply_to_id_bskRECORDCOMMENT (in_reply_to_id_bskRECORDCOMMENT);" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD reply_order_cached_data blob NULL default NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE bskRECORDCOMMENT ADD reply_order_cached_data blob NULL default NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD INDEX (reply_order_cached_data(40));" | ${prefix}/bin/dbexec echo "ALTER TABLE bskRECORDCOMMENT ADD INDEX (reply_order_cached_data(40));" | ${prefix}/bin/dbexec echo -e 'from invenio.webcommentadminlib import migrate_comments_populate_threads_index;\ migrate_comments_populate_threads_index()' | $(PYTHON) echo -e 'from invenio.access_control_firerole import repair_role_definitions;\ repair_role_definitions()' | $(PYTHON) CLEANFILES = *~ *.pyc *.tmp diff --git a/Vagrantfile b/Vagrantfile index 5f90d314a..3d5359b0a 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -1,68 +1,68 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. # Copyright (C) 2016 CERN. # # Invenio is free software; you can redistribute it # and/or modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the # Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, # MA 02111-1307, USA. # # In applying this license, CERN does not # waive the privileges and immunities granted to it by virtue of its status # as an Intergovernmental Organization or submit itself to any jurisdiction. # This Vagrant configuration is suitable for Invenio demo site installation as # governed by `.inveniorc`. It uses separate dedicated VMs for various services # in order to better emulate production environment conditions. 
You can install # an Invenio demo site by running: # # $ vagrant up --no-parallel # $ vagrant ssh web -c 'source .inveniorc && /vagrant/scripts/create-instance.sh' # $ vagrant ssh web -c 'source .inveniorc && /vagrant/scripts/populate-instance.sh' # $ firefox http://192.168.50.10/record/1 # $ vagrant ssh web -c 'source .inveniorc && sudo -u www-data /opt/invenio/bin/inveniocfg --run-unit-tests' # $ vagrant ssh web -c 'source .inveniorc && sudo -u www-data /opt/invenio/bin/inveniocfg --run-regression-tests --yes-i-know' #OS = 'hfm4/centos6' OS = 'ubuntu/precise64' Vagrant.configure("2") do |config| if Vagrant.has_plugin?("vagrant-cachier") config.cache.scope = :box end config.vm.define "web" do |web| web.vm.box = OS web.vm.hostname = 'web' web.vm.provision "file", source: ".inveniorc", destination: ".inveniorc" web.vm.provision "shell", inline: "source .inveniorc && /vagrant/scripts/provision-web.sh", privileged: false web.vm.network "forwarded_port", guest: 80, host: 80 web.vm.network "forwarded_port", guest: 443, host: 443 web.vm.network "private_network", ip: ENV.fetch('INVENIO_WEB_HOST','192.168.50.10') web.vm.provider :virtualbox do |vb| - vb.customize ["modifyvm", :id, "--memory", "3072"] + vb.customize ["modifyvm", :id, "--memory", "4096"] vb.customize ["modifyvm", :id, "--cpus", 2] end end config.vm.define "mysql" do |mysql| mysql.vm.box = OS mysql.vm.hostname = 'mysql' mysql.vm.provision "file", source: ".inveniorc", destination: ".inveniorc" mysql.vm.provision "shell", inline: "source .inveniorc && /vagrant/scripts/provision-mysql.sh", privileged: false mysql.vm.network "private_network", ip: ENV.fetch('INVENIO_MYSQL_HOST','192.168.50.11') end end diff --git a/modules/bibdocfile/lib/bibdocfile.py b/modules/bibdocfile/lib/bibdocfile.py index 0cada52da..4f645a990 100644 --- a/modules/bibdocfile/lib/bibdocfile.py +++ b/modules/bibdocfile/lib/bibdocfile.py @@ -1,4976 +1,4976 @@ # This file is part of Invenio. -# Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 CERN. +# Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ This module implements the low-level API for dealing with fulltext files. - All the files associated to a I{record} (identified by a I{recid}) can be managed via an instance of the C{BibRecDocs} class. - A C{BibRecDocs} is a wrapper of the list of I{documents} attached to the record. - Each document is represented by an instance of the C{BibDoc} class. - A document is identified by a C{docid} and name (C{docname}). The docname must be unique within the record. A document is the set of all the formats and revisions of a piece of information. - A document has a type called C{doctype} and can have a restriction. - Each physical file, i.e. 
the concretization of a document into a particular I{version} and I{format} is represented by an instance of the C{BibDocFile} class. - The format is infact the extension of the physical file. - A comment and a description and other information can be associated to a BibDocFile. - A C{bibdoc} is a synonim for a document, while a C{bibdocfile} is a synonim for a physical file. @group Main classes: BibRecDocs,BibDoc,BibDocFile @group Other classes: BibDocMoreInfo,Md5Folder,InvenioBibDocFileError @group Main functions: decompose_file,stream_file,bibdocfile_*,download_url @group Configuration Variables: CFG_* """ __revision__ = "$Id$" import os import re import shutil import filecmp import time import random import socket import urllib2 import urllib import tempfile import cPickle import base64 import binascii import cgi import sys import copy import tarfile if sys.hexversion < 0x2060000: from md5 import md5 else: from hashlib import md5 # pylint: disable=E0611 try: import magic if hasattr(magic, "open"): CFG_HAS_MAGIC = 1 if not hasattr(magic, "MAGIC_MIME_TYPE"): ## Patching RHEL6/CentOS6 version magic.MAGIC_MIME_TYPE = 16 elif hasattr(magic, "Magic"): CFG_HAS_MAGIC = 2 except ImportError: CFG_HAS_MAGIC = 0 from datetime import datetime from mimetypes import MimeTypes from thread import get_ident from weakref import ref from urlparse import urlsplit, parse_qs from invenio import webinterface_handler_config as apache # Let's set a reasonable timeout for URL request (e.g. FFT) socket.setdefaulttimeout(40) if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 from invenio.shellutils import escape_shell_arg, run_shell_command from invenio.dbquery import run_sql, DatabaseError from invenio.errorlib import register_exception from invenio.bibrecord import record_get_field_instances, \ field_get_subfield_values, field_get_subfield_instances, \ encode_for_xml from invenio.urlutils import create_url, make_user_agent_string from invenio.textutils import nice_size from invenio.webuser import collect_user_info from invenio.access_control_engine import acc_authorize_action from invenio.access_control_admin import acc_is_user_in_role, acc_get_role_id from invenio.access_control_firerole import compile_role_definition, acc_firerole_check_user from invenio.access_control_config import SUPERADMINROLE, CFG_WEBACCESS_WARNING_MSGS from invenio.config import CFG_SITE_URL, \ CFG_WEBDIR, CFG_BIBDOCFILE_FILEDIR,\ CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS, \ CFG_BIBDOCFILE_FILESYSTEM_BIBDOC_GROUP_LIMIT, CFG_SITE_SECURE_URL, \ CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS, \ CFG_TMPDIR, CFG_TMPSHAREDDIR, CFG_PATH_MD5SUM, \ CFG_WEBSUBMIT_STORAGEDIR, \ CFG_BIBDOCFILE_USE_XSENDFILE, \ CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY, \ CFG_SITE_RECORD, CFG_PYLIBDIR, \ CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS, \ CFG_BIBDOCFILE_ENABLE_BIBDOCFSINFO_CACHE, \ CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES, \ CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING, \ CFG_BIBCATALOG_SYSTEM from invenio.bibcatalog import BIBCATALOG_SYSTEM from invenio.bibdocfile_config import CFG_BIBDOCFILE_ICON_SUBFORMAT_RE, \ CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT, CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS from invenio.pluginutils import PluginContainer import invenio.template def _plugin_bldr(dummy, plugin_code): """Preparing the plugin dictionary structure""" ret = {} ret['create_instance'] = getattr(plugin_code, "create_instance", None) ret['supports'] = getattr(plugin_code, "supports", None) return ret _CFG_BIBDOC_PLUGINS = 
None def get_plugins(): """ Lazy loading of plugins """ global _CFG_BIBDOC_PLUGINS if _CFG_BIBDOC_PLUGINS is None: _CFG_BIBDOC_PLUGINS = PluginContainer( os.path.join(CFG_PYLIBDIR, 'invenio', 'bibdocfile_plugins', 'bom_*.py'), plugin_builder=_plugin_bldr) return _CFG_BIBDOC_PLUGINS bibdocfile_templates = invenio.template.load('bibdocfile') # The above flag controls whether HTTP range requests are supported or not # when serving static files via Python. This is disabled by default as # it currently breaks support for opening PDF files on Windows platforms # using Acrobat reader brower plugin. CFG_ENABLE_HTTP_RANGE_REQUESTS = False #: block size when performing I/O. CFG_BIBDOCFILE_BLOCK_SIZE = 1024 * 8 #: threshold used do decide when to use Python MD5 of CLI MD5 algorithm. CFG_BIBDOCFILE_MD5_THRESHOLD = 256 * 1024 #: chunks loaded by the Python MD5 algorithm. CFG_BIBDOCFILE_MD5_BUFFER = 1024 * 1024 #: whether to normalize e.g. ".JPEG" and ".jpg" into .jpeg. CFG_BIBDOCFILE_STRONG_FORMAT_NORMALIZATION = False #: flags that can be associated to files. CFG_BIBDOCFILE_AVAILABLE_FLAGS = ( 'PDF/A', 'STAMPED', 'PDFOPT', 'HIDDEN', 'CONVERTED', 'PERFORM_HIDE_PREVIOUS', 'OCRED' ) DBG_LOG_QUERIES = False #: constant used if FFT correct with the obvious meaning. KEEP_OLD_VALUE = 'KEEP-OLD-VALUE' _CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS = [(re.compile(_regex), _headers) for _regex, _headers in CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS] _mimes = MimeTypes(strict=False) _mimes.suffix_map.update({'.tbz2' : '.tar.bz2'}) _mimes.encodings_map.update({'.bz2' : 'bzip2'}) if CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES: for key, value in CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES.iteritems(): _mimes.add_type(key, value) del key, value _magic_cookies = {} if CFG_HAS_MAGIC == 1: def _get_magic_cookies(): """ @return: a tuple of magic object. @rtype: (MAGIC_NONE, MAGIC_COMPRESS, MAGIC_MIME, MAGIC_COMPRESS + MAGIC_MIME) @note: ... not real magic. Just see: man file(1) """ thread_id = get_ident() if thread_id not in _magic_cookies: _magic_cookies[thread_id] = { magic.MAGIC_NONE: magic.open(magic.MAGIC_NONE), magic.MAGIC_COMPRESS: magic.open(magic.MAGIC_COMPRESS), magic.MAGIC_MIME: magic.open(magic.MAGIC_MIME), magic.MAGIC_COMPRESS + magic.MAGIC_MIME: magic.open(magic.MAGIC_COMPRESS + magic.MAGIC_MIME), magic.MAGIC_MIME_TYPE: magic.open(magic.MAGIC_MIME_TYPE), } for key in _magic_cookies[thread_id].keys(): _magic_cookies[thread_id][key].load() return _magic_cookies[thread_id] elif CFG_HAS_MAGIC == 2: def _magic_wrapper(local_path, mime=True, mime_encoding=False): thread_id = get_ident() if (thread_id, mime, mime_encoding) not in _magic_cookies: magic_object = _magic_cookies[thread_id, mime, mime_encoding] = magic.Magic(mime=mime, mime_encoding=mime_encoding) else: magic_object = _magic_cookies[thread_id, mime, mime_encoding] return magic_object.from_file(local_path) # pylint: disable=E1103 def _generate_extensions(): """ Generate the regular expression to match all the known extensions. @return: the regular expression. @rtype: regular expression object """ _tmp_extensions = _mimes.encodings_map.keys() + \ _mimes.suffix_map.keys() + \ _mimes.types_map[1].keys() + \ CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS extensions = [] for ext in _tmp_extensions: if ext.startswith('.'): extensions.append(ext) else: extensions.append('.' 
+ ext) extensions.sort() extensions.reverse() extensions = set([ext.lower() for ext in extensions]) extensions = '\\' + '$|\\'.join(extensions) + '$' extensions = extensions.replace('+', '\\+') return re.compile(extensions, re.I) #: Regular expression to recognized extensions. _extensions = _generate_extensions() class InvenioBibDocFileError(Exception): """ Exception raised in case of errors related to fulltext files. """ pass class InvenioBibdocfileUnauthorizedURL(InvenioBibDocFileError): """ Exception raised in case of errors related to fulltext files. """ ## NOTE: this is a legacy Exception pass def _val_or_null(val, eq_name = None, q_str = None, q_args = None): """ Auxiliary function helpful while building WHERE clauses of SQL queries that should contain field=val or field is val If optional parameters q_str and q_args are provided, lists are updated if val == None, a statement of the form "eq_name is Null" is returned otherwise, otherwise the function returns a parametrised comparison "eq_name=%s" with val as an argument added to the query args list. Using parametrised queries diminishes the likelihood of having SQL injection. @param val Value to compare with @type val @param eq_name The name of the database column @type eq_name string @param q_str Query string builder - list of clauses that should be connected by AND operator @type q_str list @param q_args Query arguments list. This list will be applied as a second argument of run_sql command @type q_args list @result string of a single part of WHERE clause @rtype string """ res = "" if eq_name != None: res += eq_name if val == None: if eq_name != None: res += " is " res += "NULL" if q_str != None: q_str.append(res) return res else: if eq_name != None: res += "=" res += "%s" if q_str != None: q_str.append(res) if q_args != None: q_args.append(str(val)) return res def _sql_generate_conjunctive_where(to_process): """Generating WHERE clause of a SQL statement, consisting of conjunction of declared terms. Terms are defined by the to_process argument. the method creates appropriate entries different in the case, value should be NULL (None in the list) and in the case of not-none arguments. In the second case, parametrised query is generated decreasing the chance of an SQL-injection. @param to_process List of tuples (value, database_column) @type to_process list""" q_str = [] q_args = [] for entry in to_process: q_str.append(_val_or_null(entry[0], eq_name = entry[1], q_args = q_args)) return (" AND ".join(q_str), q_args) def file_strip_ext(afile, skip_version=False, only_known_extensions=False, allow_subformat=True): """ Strip in the best way the extension from a filename. >>> file_strip_ext("foo.tar.gz") 'foo' >>> file_strip_ext("foo.buz.gz") 'foo.buz' >>> file_strip_ext("foo.buz") 'foo' >>> file_strip_ext("foo.buz", only_known_extensions=True) 'foo.buz' >>> file_strip_ext("foo.buz;1", skip_version=False, ... only_known_extensions=True) 'foo.buz;1' >>> file_strip_ext("foo.gif;icon") 'foo' >>> file_strip_ext("foo.gif:icon", allow_subformat=False) 'foo.gif:icon' @param afile: the path/name of a file. @type afile: string @param skip_version: whether to skip a trailing ";version". @type skip_version: bool @param only_known_extensions: whether to strip out only known extensions or to consider as extension anything that follows a dot. @type only_known_extensions: bool @param allow_subformat: whether to consider also subformats as part of the extension. @type allow_subformat: bool @return: the name/path without the extension (and version). 
@rtype: string """ if skip_version or allow_subformat: afile = afile.split(';')[0] nextfile = _extensions.sub('', afile) if nextfile == afile and not only_known_extensions: nextfile = os.path.splitext(afile)[0] while nextfile != afile: afile = nextfile nextfile = _extensions.sub('', afile) return nextfile def normalize_format(docformat, allow_subformat=True): """ Normalize the format, e.g. by adding a dot in front. @param format: the format/extension to be normalized. @type format: string @param allow_subformat: whether to consider also subformats as part of the extension. @type allow_subformat: bool @return: the normalized format. @rtype; string """ if not docformat: return '' if allow_subformat: subformat = docformat[docformat.rfind(';'):] docformat = docformat[:docformat.rfind(';')] else: subformat = '' if docformat and docformat[0] != '.': docformat = '.' + docformat if CFG_BIBDOCFILE_STRONG_FORMAT_NORMALIZATION: if docformat not in ('.Z', '.H', '.C', '.CC'): docformat = docformat.lower() docformat = { '.jpg' : '.jpeg', '.htm' : '.html', '.tif' : '.tiff' }.get(docformat, docformat) return docformat + subformat def guess_format_from_url(url): """ Given a URL tries to guess it's extension. Different method will be used, including HTTP HEAD query, downloading the resource and using mime @param url: the URL for which the extension shuld be guessed. @type url: string @return: the recognized extension or '.bin' if it's impossible to recognize it. @rtype: string """ def guess_via_magic(local_path): try: if CFG_HAS_MAGIC == 1: magic_cookie = _get_magic_cookies()[magic.MAGIC_MIME_TYPE] mimetype = magic_cookie.file(local_path) elif CFG_HAS_MAGIC == 2: mimetype = _magic_wrapper(local_path, mime=True, mime_encoding=False) if CFG_HAS_MAGIC: if mimetype in CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING: return normalize_format(CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING[mimetype]) else: return normalize_format(_mimes.guess_extension(mimetype)) except Exception: pass ## Let's try to guess the extension by considering the URL as a filename ext = decompose_file(url, skip_version=True, only_known_extensions=True)[2] if ext.startswith('.'): return ext if is_url_a_local_file(url): ## The URL corresponds to a local file, so we can safely consider ## traditional extensions after the dot. ext = decompose_file(url, skip_version=True, only_known_extensions=False)[2] if ext.startswith('.'): return ext ## No extensions? Let's use Magic. ext = guess_via_magic(url) if ext: return ext else: ## Since the URL is remote, let's try to perform a HEAD request ## and see the corresponding headers try: response = open_url(url, head_request=True) except (InvenioBibdocfileUnauthorizedURL, urllib2.URLError): return ".bin" ext = get_format_from_http_response(response) if ext: return ext if CFG_HAS_MAGIC: ## Last solution: let's download the remote resource ## and use the Python magic library to guess the extension filename = "" try: try: filename = download_url(url, docformat='') ext = guess_via_magic(filename) if ext: return ext except Exception: pass finally: if os.path.exists(filename): ## Let's free space os.remove(filename) return ".bin" _docname_re = re.compile(r'[^-\w.]*') def normalize_docname(docname): """ Normalize the docname. At the moment the normalization is just returning the same string. @param docname: the docname to be normalized. @type docname: string @return: the normalized docname. @rtype: string """ #return _docname_re.sub('', docname) return docname def normalize_version(version): """ Normalize the version. 
The version can be either an integer or the keyword 'all'. Any other value will be transformed into the empty string. @param version: the version (either a number or 'all'). @type version: integer or string @return: the normalized version. @rtype: string """ try: int(version) except ValueError: if version.lower().strip() == 'all': return 'all' else: return '' return str(version) def compose_file(dirname, extension, subformat=None, version=None, storagename=None): """ Construct back a fullpath given the separate components. @param @param storagename Name under which the file should be stored in the filesystem @type storagename string @return a fullpath to the file @rtype string """ if version: version = ";%i" % int(version) else: version = "" if subformat: if not subformat.startswith(";"): subformat = ";%s" % subformat else: subformat = "" if extension and not extension.startswith("."): extension = ".%s" % extension if not storagename: storagename = "content" return os.path.join(dirname, storagename + extension + subformat + version) def compose_format(extension, subformat=None): """ Construct the format string """ if not extension.startswith("."): extension = ".%s" % extension if subformat: if not subformat.startswith(";"): subformat = ";%s" % subformat else: subformat = "" return extension + subformat def decompose_file(afile, skip_version=False, only_known_extensions=False, allow_subformat=True): """ Decompose a file/path into its components dirname, basename and extension. >>> decompose_file('/tmp/foo.tar.gz') ('/tmp', 'foo', '.tar.gz') >>> decompose_file('/tmp/foo.tar.gz;1', skip_version=True) ('/tmp', 'foo', '.tar.gz') >>> decompose_file('http://www.google.com/index.html') ('http://www.google.com', 'index', '.html') @param afile: the path/name of a file. @type afile: string @param skip_version: whether to skip a trailing ";version". @type skip_version: bool @param only_known_extensions: whether to strip out only known extensions or to consider as extension anything that follows a dot. @type only_known_extensions: bool @param allow_subformat: whether to consider also subformats as part of the extension. @type allow_subformat: bool @return: a tuple with the directory name, the basename and extension. @rtype: (dirname, basename, extension) @note: if a URL is provided, the scheme will be part of the dirname. @see: L{file_strip_ext} for the algorithm used to retrieve the extension. """ if skip_version: version = afile.split(';')[-1] try: int(version) afile = afile[:-len(version)-1] except ValueError: pass basename = os.path.basename(afile) dirname = afile[:-len(basename)-1] base = file_strip_ext( basename, only_known_extensions=only_known_extensions, allow_subformat=allow_subformat) extension = basename[len(base) + 1:] if extension: extension = '.' + extension return (dirname, base, extension) def decompose_file_with_version(afile): """ Decompose a file into dirname, basename, extension and version. >>> decompose_file_with_version('/tmp/foo.tar.gz;1') ('/tmp', 'foo', '.tar.gz', 1) @param afile: the path/name of a file. @type afile: string @return: a tuple with the directory name, the basename, extension and version. @rtype: (dirname, basename, extension, version) @raise ValueError: in case version does not exist it will. @note: if a URL is provided, the scheme will be part of the dirname. 
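# Illustration only (not in the original module): compose_file() above has no
# doctest of its own, so here is how it reconstructs a path from its separate
# components, using the default storage name 'content' (compare the
# decompose_file() examples above):
#
#   >>> compose_file('/tmp', '.tar.gz', version=2)
#   '/tmp/content.tar.gz;2'
#   >>> compose_file('/tmp', 'gif', subformat='icon')
#   '/tmp/content.gif;icon'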
""" version_str = afile.split(';')[-1] version = int(version_str) afile = afile[:-len(version_str)-1] basename = os.path.basename(afile) dirname = afile[:-len(basename)-1] base = file_strip_ext(basename) extension = basename[len(base) + 1:] if extension: extension = '.' + extension return (dirname, base, extension, version) def get_subformat_from_format(docformat): """ @return the subformat if any. @rtype: string >>> get_subformat_from_format('foo;bar') 'bar' >>> get_subformat_from_format('foo') '' """ try: return docformat[docformat.rindex(';') + 1:] except ValueError: return '' def get_superformat_from_format(docformat): """ @return the superformat if any. @rtype: string >>> get_superformat_from_format('foo;bar') 'foo' >>> get_superformat_from_format('foo') 'foo' """ try: return docformat[:docformat.rindex(';')] except ValueError: return docformat def propose_next_docname(docname): """ Given a I{docname}, suggest a new I{docname} (useful when trying to generate a unique I{docname}). >>> propose_next_docname('foo') 'foo_1' >>> propose_next_docname('foo_1') 'foo_2' >>> propose_next_docname('foo_10') 'foo_11' @param docname: the base docname. @type docname: string @return: the next possible docname based on the given one. @rtype: string """ if '_' in docname: split_docname = docname.split('_') try: split_docname[-1] = str(int(split_docname[-1]) + 1) docname = '_'.join(split_docname) except ValueError: docname += '_1' else: docname += '_1' return docname class BibRecDocs(object): """ This class represents all the files attached to one record. @param recid: the record identifier. @type recid: integer @param deleted_too: whether to consider deleted documents as normal documents (useful when trying to recover deleted information). @type deleted_too: bool @param human_readable: whether numbers should be printed in human readable format (e.g. 2048 bytes -> 2Kb) @ivar id: the record identifier as passed to the constructor. @type id: integer @ivar human_readable: the human_readable flag as passed to the constructor. @type human_readable: bool @ivar deleted_too: the deleted_too flag as passed to the constructor. @type deleted_too: bool @ivar bibdocs: the list of documents attached to the record. @type bibdocs: list of BibDoc """ def __init__(self, recid, deleted_too=False, human_readable=False): try: self.id = int(recid) except ValueError: raise ValueError("BibRecDocs: recid is %s but must be an integer." % repr(recid)) self.human_readable = human_readable self.deleted_too = deleted_too self.attachment_types = {} # dictionary docname->attachment type self._bibdocs = [] self.dirty = True @property def bibdocs(self): if self.dirty: self.build_bibdoc_list() return self._bibdocs def __repr__(self): """ @return: the canonical string representation of the C{BibRecDocs}. @rtype: string """ return 'BibRecDocs(%s%s%s)' % (self.id, self.deleted_too and ', True' or '', self.human_readable and ', True' or '' ) def __str__(self): """ @return: an easy to be I{grepped} string representation of the whole C{BibRecDocs} content. @rtype: string """ out = '%i::::total bibdocs attached=%i\n' % (self.id, len(self.bibdocs)) out += '%i::::total size latest version=%s\n' % (self.id, nice_size(self.get_total_size_latest_version())) out += '%i::::total size all files=%s\n' % (self.id, nice_size(self.get_total_size())) for (docname, (bibdoc, dummy)) in self.bibdocs.items(): out += str(docname) + ":" + str(bibdoc) return out def empty_p(self): """ @return: True when the record has no attached documents. 
@rtype: bool """ return len(self.bibdocs) == 0 def deleted_p(self): """ @return: True if the correxsponding record has been deleted. @rtype: bool """ from invenio.search_engine import record_exists return record_exists(self.id) == -1 def get_xml_8564(self): """ Return a snippet of I{MARCXML} representing the I{8564} fields corresponding to the current state. @return: the MARCXML representation. @rtype: string """ from invenio.search_engine import get_record out = '' record = get_record(self.id) fields = record_get_field_instances(record, '856', '4', ' ') for field in fields: urls = field_get_subfield_values(field, 'u') if urls and not bibdocfile_url_p(urls[0]): out += '\t\n' for subfield, value in field_get_subfield_instances(field): out += '\t\t%s\n' % (subfield, encode_for_xml(value)) out += '\t\n' for afile in self.list_latest_files(list_hidden=False): out += '\t\n' url = afile.get_url() description = afile.get_description() comment = afile.get_comment() if url: out += '\t\t%s\n' % encode_for_xml(url) if description: out += '\t\t%s\n' % encode_for_xml(description) if comment: out += '\t\t%s\n' % encode_for_xml(comment) out += '\t\n' return out def get_total_size_latest_version(self, user_info=None, subformat=None): """ Returns the total size used on disk by all the files belonging to this record and corresponding to the latest version. @param user_info: the user_info dictionary, used to check restrictions @type: dict @param subformat: if subformat is specified, it limits files only to those from that specific subformat @type subformat: string @return: the total size. @rtype: integer """ size = 0 for (bibdoc, _) in self.bibdocs.values(): size += bibdoc.get_total_size_latest_version(user_info, subformat) return size def get_total_size(self): """ Return the total size used on disk of all the files belonging to this record of any version (not only the last as in L{get_total_size_latest_version}). @return: the total size. @rtype: integer """ size = 0 for (bibdoc, _) in self.bibdocs.values(): size += bibdoc.get_total_size() return size def build_bibdoc_list(self): """ This method must be called everytime a I{bibdoc} is added, removed or modified. """ self._bibdocs = {} if self.deleted_too: res = run_sql("""SELECT brbd.id_bibdoc, brbd.docname, brbd.type FROM bibrec_bibdoc as brbd JOIN bibdoc as bd ON bd.id=brbd.id_bibdoc WHERE brbd.id_bibrec=%s ORDER BY brbd.docname ASC""", (self.id,)) else: res = run_sql("""SELECT brbd.id_bibdoc, brbd.docname, brbd.type FROM bibrec_bibdoc as brbd JOIN bibdoc as bd ON bd.id=brbd.id_bibdoc WHERE brbd.id_bibrec=%s AND bd.status<>'DELETED' ORDER BY brbd.docname ASC""", (self.id,)) for row in res: cur_doc = BibDoc.create_instance(docid=row[0], recid=self.id, human_readable=self.human_readable) self._bibdocs[row[1]] = (cur_doc, row[2]) self.dirty = False def list_bibdocs_by_names(self, doctype=None): """ Returns the dictionary of all bibdocs object belonging to a recid. Keys in the dictionary are names of documetns and values are BibDoc objects. If C{doctype} is set, it returns just the bibdocs of that doctype. @param doctype: the optional doctype. @type doctype: string @return: the dictionary of bibdocs. 
@rtype: dictionary of Dcname -> BibDoc """ if not doctype: return dict((k, v) for (k, (v, _)) in self.bibdocs.iteritems()) res = {} for (docname, (doc, attachmenttype)) in self.bibdocs.iteritems(): if attachmenttype == doctype: res[docname] = doc return res def list_bibdocs(self, doctype=None, rel_type=None): """ Returns the list all bibdocs object belonging to a recid. If C{doctype} is set, it returns just the bibdocs of that doctype. @param doctype: the optional doctype. @type doctype: string @return: the list of bibdocs. @rtype: list of BibDoc """ return [bibdoc for (bibdoc, rtype) in self.bibdocs.values() if (not doctype or doctype == bibdoc.doctype) and (rel_type is None or rel_type == rtype)] def get_bibdoc_names(self, doctype=None): """ Returns all the names of the documents associated with the bibrec. If C{doctype} is set, restrict the result to all the matching doctype. @param doctype: the optional doctype. @type doctype: string @return: the list of document names. @rtype: list of string """ return [docname for (docname, dummy) in self.list_bibdocs_by_names(doctype).items()] def check_file_exists(self, path, f_format): """ Check if a file with the same content of the file pointed in C{path} is already attached to this record. @param path: the file to be checked against. @type path: string @return: True if a file with the requested content is already attached to the record. @rtype: bool """ size = os.path.getsize(path) # Let's consider all the latest files files = self.list_latest_files() # Let's consider all the latest files with same size potential = [afile for afile in files if afile.get_size() == size and afile.format == f_format] if potential: checksum = calculate_md5(path) # Let's consider all the latest files with the same size and the # same checksum potential = [afile for afile in potential if afile.get_checksum() == checksum] if potential: potential = [afile for afile in potential if filecmp.cmp(afile.get_full_path(), path)] if potential: return True else: # Gosh! How unlucky, same size, same checksum but not same # content! pass return False def propose_unique_docname(self, docname): """ Given C{docname}, return a new docname that is not already attached to the record. @param docname: the reference docname. @type docname: string @return: a docname not already attached. @rtype: string """ docname = normalize_docname(docname) goodname = docname i = 1 while goodname in self.get_bibdoc_names(): i += 1 goodname = "%s_%s" % (docname, i) return goodname def merge_bibdocs(self, docname1, docname2): """ This method merge C{docname2} into C{docname1}. 1. Given all the formats of the latest version of the files attached to C{docname2}, these files are added as new formats into C{docname1}. 2. C{docname2} is marked as deleted. @raise InvenioBibDocFileError: if at least one format in C{docname2} already exists in C{docname1}. (In this case the two bibdocs are preserved) @note: comments and descriptions are also copied. @note: if C{docname2} has a I{restriction}(i.e. if the I{status} is set) and C{docname1} doesn't, the restriction is imported. """ bibdoc1 = self.get_bibdoc(docname1) bibdoc2 = self.get_bibdoc(docname2) ## Check for possibility for bibdocfile in bibdoc2.list_latest_files(): docformat = bibdocfile.get_format() if bibdoc1.format_already_exists_p(docformat): raise InvenioBibDocFileError('Format %s already exists in bibdoc %s of record %s. It\'s impossible to merge bibdoc %s into it.' % (docformat, docname1, self.id, docname2)) ## Importing restriction if needed. 
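## Note: the restriction is copied from docname2 only when docname1 carries
## no status of its own; if both bibdocs are restricted, docname1 keeps its
## original status.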
restriction1 = bibdoc1.get_status() restriction2 = bibdoc2.get_status() if restriction2 and not restriction1: bibdoc1.set_status(restriction2) ## Importing formats for bibdocfile in bibdoc2.list_latest_files(): docformat = bibdocfile.get_format() comment = bibdocfile.get_comment() description = bibdocfile.get_description() bibdoc1.add_file_new_format(bibdocfile.get_full_path(), description=description, comment=comment, docformat=docformat) ## Finally deleting old bibdoc2 bibdoc2.delete() self.dirty = True def get_docid(self, docname): """ @param docname: the document name. @type docname: string @return: the identifier corresponding to the given C{docname}. @rtype: integer @raise InvenioBibDocFileError: if the C{docname} does not corresponds to a document attached to this record. """ if docname in self.bibdocs: return self.bibdocs[docname][0].id raise InvenioBibDocFileError, "Recid '%s' is not connected with a " \ "docname '%s'" % (self.id, docname) def get_docname(self, docid): """ @param docid: the document identifier. @type docid: integer @return: the name of the document corresponding to the given document identifier. @rtype: string @raise InvenioBibDocFileError: if the C{docid} does not corresponds to a document attached to this record. """ for (docname, (bibdoc, _)) in self.bibdocs.items(): if bibdoc.id == docid: return docname raise InvenioBibDocFileError, "Recid '%s' is not connected with a " \ "docid '%s'" % (self.id, docid) def change_name(self, newname, oldname=None, docid=None): """ Renames document of a given name. @param newname: the new name. @type newname: string @raise InvenioBibDocFileError: if the new name corresponds to a document already attached to the record owning this document. """ if not oldname and not docid: raise StandardError("Trying to rename unspecified document") if not oldname: oldname = self.get_docname(docid) if not docid: docid = self.get_docid(oldname) doc, atttype = self.bibdocs[oldname] newname = normalize_docname(newname) res = run_sql("SELECT id_bibdoc FROM bibrec_bibdoc WHERE id_bibrec=%s AND docname=%s", (self.id, newname)) if res: raise InvenioBibDocFileError, "A bibdoc called %s already exists for recid %s" % (newname, self.id) doc.change_name(self.id, newname) # updating the record structure del self._bibdocs[oldname] self._bibdocs[newname] = (doc, atttype) def has_docname_p(self, docname): """ @param docname: the document name, @type docname: string @return: True if a document with the given name is attached to this record. @rtype: bool """ return docname in self.bibdocs.keys() def get_bibdoc(self, docname): """ @return: the bibdoc with a particular docname associated with this recid""" if docname in self.bibdocs: return self.bibdocs[docname][0] raise InvenioBibDocFileError, "Recid '%s' is not connected with " \ " docname '%s'" % (self.id, docname) def delete_bibdoc(self, docname): """ Deletes the document with the specified I{docname}. @param docname: the document name. @type docname: string """ if docname in self.bibdocs: self.bibdocs[docname][0].delete() self.dirty = True def add_bibdoc(self, doctype="Main", docname='file', never_fail=False): """ Add a new empty document object (a I{bibdoc}) to the list of documents of this record. @param doctype: the document type. @type doctype: string @param docname: the document name. @type docname: string @param never_fail: if True, this procedure will not fail, even if a document with the given name is already attached to this record. 
In this case a new name will be generated (see L{propose_unique_docname}). @type never_fail: bool @return: the newly created document object. @rtype: BibDoc @raise InvenioBibDocFileError: in case of any error. """ try: docname = normalize_docname(docname) if never_fail: docname = self.propose_unique_docname(docname) if docname in self.get_bibdoc_names(): raise InvenioBibDocFileError, \ "%s has already a bibdoc with docname %s" % (self.id, docname) else: bibdoc = BibDoc.create_instance(recid=self.id, doctype=doctype, docname=docname, human_readable=self.human_readable) self.dirty = True return bibdoc except Exception, e: register_exception() raise InvenioBibDocFileError(str(e)) def add_new_file(self, fullpath, doctype="Main", docname=None, never_fail=False, description=None, comment=None, docformat=None, flags=None, modification_date=None): """ Directly add a new file to this record. Adds a new file with the following policy: - if the C{docname} is not set it is retrieved from the name of the file. - If a bibdoc with the given docname doesn't already exist, it is created and the file is added to it. - It it exist but it doesn't contain the format that is being added, the new format is added. - If the format already exists then if C{never_fail} is True a new bibdoc is created with a similar name but with a progressive number as a suffix and the file is added to it (see L{propose_unique_docname}). @param fullpath: the filesystme path of the document to be added. @type fullpath: string @param doctype: the type of the document. @type doctype: string @param docname: the document name. @type docname: string @param never_fail: if True, this procedure will not fail, even if a document with the given name is already attached to this record. In this case a new name will be generated (see L{propose_unique_docname}). @type never_fail: bool @param description: an optional description of the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be guessed (see L{guess_format_from_url}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @return: the elaborated document object. @rtype: BibDoc @raise InvenioBibDocFileError: in case of error. """ if docname is None: docname = decompose_file(fullpath)[1] if docformat is None: docformat = decompose_file(fullpath)[2] docname = normalize_docname(docname) try: bibdoc = self.get_bibdoc(docname) except InvenioBibDocFileError: # bibdoc doesn't already exists! bibdoc = self.add_bibdoc(doctype, docname, False) bibdoc.add_file_new_version(fullpath, description=description, comment=comment, docformat=docformat, flags=flags, modification_date=modification_date) else: try: bibdoc.add_file_new_format(fullpath, description=description, comment=comment, docformat=docformat, flags=flags, modification_date=modification_date) except InvenioBibDocFileError, dummy: # Format already exist! if never_fail: bibdoc = self.add_bibdoc(doctype, docname, True) bibdoc.add_file_new_version(fullpath, description=description, comment=comment, docformat=docformat, flags=flags, modification_date=modification_date) else: raise return bibdoc def add_new_version(self, fullpath, docname=None, description=None, comment=None, docformat=None, flags=None): """ Adds a new file to an already existent document object as a new version. 
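A typical call looks like the following (illustrative sketch only; it assumes that record 123 exists and that '/tmp/thesis.pdf' is a readable local file):
    BibRecDocs(123).add_new_version('/tmp/thesis.pdf', docname='thesis')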
@param fullpath: the filesystem path of the file to be added. @type fullpath: string @param docname: the document name. If not specified it will be extracted from C{fullpath} (see L{decompose_file}). @type docname: string @param description: an optional description for the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be guessed (see L{guess_format_from_url}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @return: the elaborated document object. @rtype: BibDoc @raise InvenioBibDocFileError: in case of error. @note: previous files associated with the same document will be considered obsolete. """ if docname is None: docname = decompose_file(fullpath)[1] if docformat is None: docformat = decompose_file(fullpath)[2] if flags is None: flags = [] if 'pdfa' in get_subformat_from_format(docformat).split(';') and not 'PDF/A' in flags: flags.append('PDF/A') bibdoc = self.get_bibdoc(docname=docname) bibdoc.add_file_new_version(fullpath, description=description, comment=comment, docformat=docformat, flags=flags) return bibdoc def add_new_format(self, fullpath, docname=None, description=None, comment=None, docformat=None, flags=None, modification_date=None): """ Adds a new file to an already existent document object as a new format. @param fullpath: the filesystem path of the file to be added. @type fullpath: string @param docname: the document name. If not specified it will be extracted from C{fullpath} (see L{decompose_file}). @type docname: string @param description: an optional description for the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be guessed (see L{guess_format_from_url}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @return: the elaborated document object. @rtype: BibDoc @raise InvenioBibDocFileError: in case the same format already exists. """ if docname is None: docname = decompose_file(fullpath)[1] if docformat is None: docformat = decompose_file(fullpath)[2] if flags is None: flags = [] if 'pdfa' in get_subformat_from_format(docformat).split(';') and not 'PDF/A' in flags: flags.append('PDF/A') bibdoc = self.get_bibdoc(docname=docname) bibdoc.add_file_new_format(fullpath, description=description, comment=comment, docformat=docformat, flags=flags, modification_date=modification_date) return bibdoc def list_latest_files(self, doctype=None, list_hidden=True): """ Returns a list of the latest files. @param doctype: if set, only document of the given type will be listed. @type doctype: string @param list_hidden: if True, will list also files with the C{HIDDEN} flag being set. @type list_hidden: bool @return: the list of latest files. @rtype: list of BibDocFile """ docfiles = [] for bibdoc in self.list_bibdocs(doctype): docfiles += bibdoc.list_latest_files(list_hidden=list_hidden) return docfiles def fix(self, docname): """ Algorithm that transform a broken/old bibdoc into a coherent one. Think of it as being the fsck of BibDocs. - All the files in the bibdoc directory will be renamed according to the document name. Proper .recid, .type, .md5 files will be created/updated. 
- In case of more than one file with the same format version a new bibdoc will be created in order to put does files. @param docname: the document name that need to be fixed. @type docname: string @return: the list of newly created bibdocs if any. @rtype: list of BibDoc @raise InvenioBibDocFileError: in case of issues that can not be fixed automatically. """ bibdoc = self.get_bibdoc(docname) versions = {} res = [] new_bibdocs = [] # List of files with the same version/format of # existing file which need new bibdoc. counter = 0 zero_version_bug = False if os.path.exists(bibdoc.basedir): from invenio.config import CFG_CERN_SITE, CFG_INSPIRE_SITE, CFG_BIBDOCFILE_AFS_VOLUME_PATTERN, CFG_BIBDOCFILE_AFS_VOLUME_QUOTA if os.path.realpath(bibdoc.basedir).startswith('/afs') and (CFG_CERN_SITE or CFG_INSPIRE_SITE): ## We are on AFS at CERN! Let's allocate directories the CERN/AFS way. E.g. ## $ afs_admin create -q 1000000 /afs/cern.ch/project/cds/files/g40 p.cds.g40 ## NOTE: This might be extended to use low-level OpenAFS CLI tools ## so that this technique could be extended to other AFS users outside CERN. mount_point = os.path.dirname(os.path.realpath(bibdoc.basedir)) if not os.path.exists(mount_point): volume = CFG_BIBDOCFILE_AFS_VOLUME_PATTERN % os.path.basename(mount_point) quota = str(CFG_BIBDOCFILE_AFS_VOLUME_QUOTA) exit_code, stdout, stderr = run_shell_command("afs_admin create -q %s %s %s", (quota, mount_point, volume)) if exit_code or stderr: raise IOError("Error in creating AFS mount point %s with quota %s and volume %s: exit_code=%s. Captured stdout:\n: %s\nCaptured stderr:\n: %s" % (mount_point, quota, volume, exit_code, stdout, stderr)) for filename in os.listdir(bibdoc.basedir): if filename[0] != '.' and ';' in filename: name, version = filename.rsplit(';', 1) try: version = int(version) except ValueError: # Strange name register_exception() raise InvenioBibDocFileError, "A file called %s exists under %s. This is not a valid name. After the ';' there must be an integer representing the file version. Please, manually fix this file either by renaming or by deleting it." % (filename, bibdoc.basedir) if version == 0: zero_version_bug = True docformat = name[len(file_strip_ext(name)):] docformat = normalize_format(docformat) if not versions.has_key(version): versions[version] = {} new_name = 'FIXING-%s-%s' % (str(counter), name) try: shutil.move('%s/%s' % (bibdoc.basedir, filename), '%s/%s' % (bibdoc.basedir, new_name)) except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in renaming '%s' to '%s': '%s'" % ('%s/%s' % (bibdoc.basedir, filename), '%s/%s' % (bibdoc.basedir, new_name), e) if versions[version].has_key(docformat): new_bibdocs.append((new_name, version)) else: versions[version][docformat] = new_name counter += 1 elif filename[0] != '.': # Strange name register_exception() raise InvenioBibDocFileError, "A file called %s exists under %s. This is not a valid name. There should be a ';' followed by an integer representing the file version. Please, manually fix this file either by renaming or by deleting it." 
% (filename, bibdoc.basedir) else: # we create the corresponding storage directory old_umask = os.umask(022) os.makedirs(bibdoc.basedir) # and save the father record id if it exists try: if self.id != "": recid_fd = open("%s/.recid" % bibdoc.basedir, "w") recid_fd.write(str(self.id)) recid_fd.close() if bibdoc.doctype != "": type_fd = open("%s/.type" % bibdoc.basedir, "w") type_fd.write(str(bibdoc.doctype)) type_fd.close() except Exception, e: register_exception() raise InvenioBibDocFileError, e os.umask(old_umask) if not versions: bibdoc.delete() self.dirty = True else: for version, formats in versions.iteritems(): if zero_version_bug: version += 1 for docformat, filename in formats.iteritems(): destination = '%s%s;%i' % (docname, docformat, version) try: shutil.move('%s/%s' % (bibdoc.basedir, filename), '%s/%s' % (bibdoc.basedir, destination)) except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in renaming '%s' to '%s': '%s'" % ('%s/%s' % (bibdoc.basedir, filename), '%s/%s' % (bibdoc.basedir, destination), e) try: recid_fd = open("%s/.recid" % bibdoc.basedir, "w") recid_fd.write(str(self.id)) recid_fd.close() type_fd = open("%s/.type" % bibdoc.basedir, "w") type_fd.write(str(bibdoc.doctype)) type_fd.close() except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in creating .recid and .type file for '%s' folder: '%s'" % (bibdoc.basedir, e) res = [] for (filename, version) in new_bibdocs: if zero_version_bug: version += 1 new_bibdoc = self.add_bibdoc(doctype=bibdoc.doctype, docname=docname, never_fail=True) new_bibdoc.add_file_new_format('%s/%s' % (bibdoc.basedir, filename), version) res.append(new_bibdoc) try: os.remove('%s/%s' % (bibdoc.basedir, filename)) except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in removing '%s': '%s'" % ('%s/%s' % (bibdoc.basedir, filename), e) Md5Folder(bibdoc.basedir).update(only_new=False) bibdoc._build_file_list() for (bibdoc, dummyatttype) in self.bibdocs.values(): if not run_sql('SELECT data_value FROM bibdocmoreinfo WHERE id_bibdoc=%s', (bibdoc.id,)): ## Import from MARC only if the bibdoc has never had ## its more_info initialized. try: bibdoc.import_descriptions_and_comments_from_marc() except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in importing description and comment from %s for record %s: %s" % (repr(bibdoc), self.id, e) return res def check_format(self, docname): """ Check for any format related issue. In case L{CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS} is altered or Python version changes, it might happen that a docname contains files which are no more docname + .format ; version, simply because the .format is now recognized (and it was not before, so it was contained into the docname). This algorithm verify if it is necessary to fix (seel L{fix_format}). @param docname: the document name whose formats should be verified. @type docname: string @return: True if format is correct. False if a fix is needed. @rtype: bool @raise InvenioBibDocFileError: in case of any error. 
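@note: for instance (illustrative), a legacy docname such as 'foo.tar.gz', created before '.tar.gz' was recognized as an extension, makes this method return False, since L{decompose_file} now splits it into 'foo' + '.tar.gz'.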
""" bibdoc = self.get_bibdoc(docname) correct_docname = decompose_file(docname + '.pdf')[1] if docname != correct_docname: return False for filename in os.listdir(bibdoc.basedir): if not filename.startswith('.'): try: dummy, dummy, docformat, version = decompose_file_with_version(filename) except Exception: raise InvenioBibDocFileError('Incorrect filename "%s" for docname %s for recid %i' % (filename, docname, self.id)) if '%s%s;%i' % (correct_docname, docformat, version) != filename: return False return True def check_duplicate_docnames(self): """ Check wethever the record is connected with at least tho documents with the same name. @return: True if everything is fine. @rtype: bool """ docnames = set() for docname in self.get_bibdoc_names(): if docname in docnames: return False else: docnames.add(docname) return True def uniformize_bibdoc(self, docname): """ This algorithm correct wrong file name belonging to a bibdoc. @param docname: the document name whose formats should be verified. @type docname: string """ bibdoc = self.get_bibdoc(docname) for filename in os.listdir(bibdoc.basedir): if not filename.startswith('.'): try: dummy, dummy, docformat, version = decompose_file_with_version(filename) except ValueError: register_exception(alert_admin=True, prefix= "Strange file '%s' is stored in %s" % (filename, bibdoc.basedir)) else: os.rename(os.path.join(bibdoc.basedir, filename), os.path.join(bibdoc.basedir, '%s%s;%i' % (docname, docformat, version))) Md5Folder(bibdoc.basedir).update() bibdoc.touch('rename') def fix_format(self, docname, skip_check=False): """ Fixes format related inconsistencies. @param docname: the document name whose formats should be verified. @type docname: string @param skip_check: if True assume L{check_format} has already been called and the need for fix has already been found. If False, will implicitly call L{check_format} and skip fixing if no error is found. @type skip_check: bool @return: in case merging two bibdocs is needed but it's not possible. @rtype: bool """ if not skip_check: if self.check_format(docname): return True bibdoc = self.get_bibdoc(docname) correct_docname = decompose_file(docname + '.pdf')[1] need_merge = False if correct_docname != docname: need_merge = self.has_docname_p(correct_docname) if need_merge: proposed_docname = self.propose_unique_docname(correct_docname) run_sql('UPDATE bibdoc SET docname=%s WHERE id=%s', (proposed_docname, bibdoc.id)) self.dirty = True self.uniformize_bibdoc(proposed_docname) try: self.merge_bibdocs(docname, proposed_docname) except InvenioBibDocFileError: return False else: run_sql('UPDATE bibdoc SET docname=%s WHERE id=%s', (correct_docname, bibdoc.id)) self.dirty = True self.uniformize_bibdoc(correct_docname) else: self.uniformize_bibdoc(docname) return True def fix_duplicate_docnames(self, skip_check=False): """ Algotirthm to fix duplicate docnames. If a record is connected with at least two bibdoc having the same docname, the algorithm will try to merge them. @param skip_check: if True assume L{check_duplicate_docnames} has already been called and the need for fix has already been found. If False, will implicitly call L{check_duplicate_docnames} and skip fixing if no error is found. 
@type skip_check: bool """ if not skip_check: if self.check_duplicate_docnames(): return docnames = set() for bibdoc in self.list_bibdocs(): docname = self.get_docname(bibdoc.id) if docname in docnames: new_docname = self.propose_unique_docname(self.get_docname(bibdoc.id)) self.change_name(docid=bibdoc.id, newname=new_docname) self.merge_bibdocs(docname, new_docname) docnames.add(docname) def get_text(self, extract_text_if_necessary=True): """ @return: concatenated texts of all bibdocs separated by " ": string """ texts = [] for bibdoc in self.list_bibdocs(): if hasattr(bibdoc, 'has_text'): if extract_text_if_necessary and not bibdoc.has_text(require_up_to_date=True): perform_ocr = hasattr(bibdoc, 'is_ocr_required') and bibdoc.is_ocr_required() from invenio.bibtask import write_message write_message("... will extract words from %s %s" % (bibdoc, perform_ocr and 'with OCR' or ''), verbose=2) bibdoc.extract_text(perform_ocr=perform_ocr) texts.append(bibdoc.get_text()) return " ".join(texts) def stream_archive_of_latest_files(self, req, files_size=''): """ Streams the tar archive with all files of a certain file size (that are not restricted or hidden) to the user. File size should be a string that can be compared with the output of BibDocFile.get_subformat() function. @param req: Apache Request Object @type req: Apache Request Object @param files_size: size of the files (they can be defined in bibdocfile_config). Empty string means the original size. @type files_size: string """ # Get the internal size from the user-friendly file size name internal_format = [f[1] for f in CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS if f[0] == files_size] if len(internal_format) < 1: # Incorrect file size return internal_format = internal_format[0] tarname = str(self.id) + "_" + files_size + '.tar' # Select files that user can download (not hidden nor restricted) user_info = collect_user_info(req) req.content_type = "application/x-tar" req.headers_out["Content-Disposition"] = 'attachment; filename="%s"' % tarname tar = tarfile.open(fileobj=req, mode='w|') for f in self.list_latest_files(): if f.get_subformat() == internal_format and f.is_restricted(user_info)[0] == 0 and not f.hidden: tar.add(f.get_path(), arcname=f.get_full_name(), recursive=False) tar.close() class BibDoc(object): """ This class represents one document (i.e. a set of files with different formats and with versioning information that consitutes a piece of information. To instanciate a new document, the recid and the docname are mandatory. To instanciate an already existing document, either the recid and docname or the docid alone are sufficient to retrieve it. @param docid: the document identifier. @type docid: integer @param recid: the record identifier of the record to which this document belongs to. If the C{docid} is specified the C{recid} is automatically retrieven from the database. @type recid: integer @param docname: the document name. @type docname: string @param doctype: the document type (used when instanciating a new document). @type doctype: string @param human_readable: whether sizes should be represented in a human readable format. @type human_readable: bool @raise InvenioBibDocFileError: in case of error. 
""" @staticmethod def create_new_document(doc_type="Main", rec_links=None): if rec_links is None: rec_links = [] status = '' doc_id = run_sql("INSERT INTO bibdoc (status, creation_date, modification_date, doctype) " "values(%s,NOW(),NOW(), %s)", (status, doc_type)) if not doc_id: raise InvenioBibDocFileError, "New docid cannot be created" # creating the representation on disk ... preparing the directory try: BibDoc.prepare_basedir(doc_id) except Exception, e: run_sql('DELETE FROM bibdoc WHERE id=%s', (doc_id, )) register_exception(alert_admin=True) raise InvenioBibDocFileError, e # the object has been created: linking to bibliographical records doc = BibDoc(doc_id) for link in rec_links: if "rec_id" in link and link["rec_id"]: rec_id = link["rec_id"] doc_name = normalize_docname(link["doc_name"]) a_type = link["a_type"] doc.attach_to_record(rec_id, str(a_type), str(doc_name)) return doc_id def __init__(self, docid, human_readable=False, initial_data=None): """Constructor of a bibdoc. At least the docid or the recid/docname pair is needed. specifying recid, docname and doctype without specifying docid results in attaching newly created document to a record """ # docid is known, the document already exists res2 = run_sql("SELECT id_bibrec, type, docname FROM bibrec_bibdoc WHERE id_bibdoc=%s", (docid,)) self.bibrec_types = [(r[0], r[1], r[2]) for r in res2 ] # just in case the result was behaving like tuples but was something else if not res2: # fake attachment self.bibrec_types = [(0, None, "fake_name_for_unattached_document")] if initial_data is None: initial_data = BibDoc._retrieve_data(docid) self._docfiles = [] self.__md5s = None self._related_files = {} self.human_readable = human_readable self.cd = initial_data["cd"] # creation date self.md = initial_data["md"] # modification date self.td = initial_data["td"] # text extraction date # should be moved from here !!!! self.bibrec_links = initial_data["bibrec_links"] self.id = initial_data["id"] self.status = initial_data["status"] self.basedir = initial_data["basedir"] self.doctype = initial_data["doctype"] self.storagename = initial_data["storagename"] # the old docname -> now used as a storage name for old records self.more_info = BibDocMoreInfo(self.id) self.dirty = True self.dirty_related_files = True self.last_action = 'init' def __del__(self): if self.dirty and self.last_action != 'init': ## The object is dirty and we did something more than initializing it self._build_file_list() @property def docfiles(self): if self.dirty: self._build_file_list(self.last_action) self.dirty = False return self._docfiles @property def related_files(self): if self.dirty_related_files: self._build_related_file_list() self.dirty_related_files = False return self._related_files @staticmethod def prepare_basedir(doc_id): """Prepares the directory serving as root of a BibDoc""" basedir = _make_base_dir(doc_id) # we create the corresponding storage directory if not os.path.exists(basedir): from invenio.config import CFG_CERN_SITE, CFG_INSPIRE_SITE, CFG_BIBDOCFILE_AFS_VOLUME_PATTERN, CFG_BIBDOCFILE_AFS_VOLUME_QUOTA if os.path.realpath(basedir).startswith('/afs') and (CFG_CERN_SITE or CFG_INSPIRE_SITE): ## We are on AFS at CERN! Let's allocate directories the CERN/AFS way. E.g. ## $ afs_admin create -q 1000000 /afs/cern.ch/project/cds/files/g40 p.cds.g40 ## NOTE: This might be extended to use low-level OpenAFS CLI tools ## so that this technique could be extended to other AFS users outside CERN. 
mount_point = os.path.dirname(os.path.realpath(basedir)) if not os.path.exists(mount_point): volume = CFG_BIBDOCFILE_AFS_VOLUME_PATTERN % os.path.basename(mount_point) quota = str(CFG_BIBDOCFILE_AFS_VOLUME_QUOTA) exit_code, stdout, stderr = run_shell_command("afs_admin create -q %s %s %s", (quota, mount_point, volume)) if exit_code or stderr: raise IOError("Error in creating AFS mount point %s with quota %s and volume %s: exit_code=%s. Captured stdout:\n: %s\nCaptured stderr:\n: %s" % (mount_point, quota, volume, exit_code, stdout, stderr)) old_umask = os.umask(022) os.makedirs(basedir) os.umask(old_umask) def _update_additional_info_files(self): """Update the hidden file in the document directory ... the file contains all links to records""" try: reclinks_fd = open("%s/.reclinks" % (self.basedir, ), "w") reclinks_fd.write("RECID DOCNAME TYPE\n") for link in self.bibrec_links: reclinks_fd.write("%(recid)s %(docname)s %(doctype)s\n" % link) reclinks_fd.close() except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError, e @staticmethod def _retrieve_data(docid = None): """ Filling information about a document from the database entry """ container = {} container["bibrec_links"] = [] container["id"] = docid container["basedir"] = _make_base_dir(container["id"]) # retrieving links betwen records and documents res = run_sql("SELECT id_bibrec, type, docname FROM bibrec_bibdoc WHERE id_bibdoc=%s", (str(docid),), 1) if res: for r in res: container["bibrec_links"].append({"recid": r[0], "doctype": r[1], "docname": r[2]}) # gather the other information res = run_sql("SELECT status, creation_date, modification_date, text_extraction_date, doctype, docname FROM bibdoc WHERE id=%s LIMIT 1", (docid,), 1) if res: container["status"] = res[0][0] container["cd"] = res[0][1] container["md"] = res[0][2] container["td"] = res[0][3] container["doctype"] = res[0][4] container["storagename"] = res[0][5] else: # this bibdoc doesn't exist raise InvenioBibDocFileError, "The docid %s does not exist." % docid # retreiving all available formats fprefix = container["storagename"] or "content" try: if CFG_BIBDOCFILE_ENABLE_BIBDOCFSINFO_CACHE: ## We take all extensions from the existing formats in the DB. container["extensions"] = set([ext[0] for ext in run_sql("SELECT format FROM bibdocfsinfo WHERE id_bibdoc=%s", (docid, ))]) else: ## We take all the extensions by listing the directory content, stripping name ## and version. 
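## For example, a file named 'content.pdf;2' stored under the default
## storage name 'content' contributes the extension '.pdf' to this set.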
container["extensions"] = set([fname[len(fprefix):].rsplit(";", 1)[0] for fname in filter(lambda x: x.startswith(fprefix), os.listdir(container["basedir"]))]) except OSError: container["extensions"] = [] register_exception() return container @staticmethod def create_instance(docid=None, recid=None, docname=None, doctype='Fulltext', a_type = '', human_readable=False): """ Parameters of an attachement to the record: a_type, recid, docname @param a_type Type of the attachment to the record (by default Main) @type a_type String @param doctype Type of the document itself (by default Fulltext) @type doctype String """ # first try to retrieve existing record based on obtained data data = None extensions = [] if docid is not None: data = BibDoc._retrieve_data(docid) doctype = data["doctype"] extensions = data["extensions"] # Loading an appropriate plugin (by default a generic BibDoc) used_plugin = None for dummy, plugin in get_plugins().iteritems(): if plugin['supports'](doctype, extensions): used_plugin = plugin if not a_type: a_type = doctype or 'Main' if not docid: rec_links = [] if recid: rec_links.append({"rec_id": recid, "doc_name" : docname, "a_type": a_type}) if used_plugin and 'create_new' in used_plugin: docid = used_plugin['create_new'](doctype, rec_links) else: docid = BibDoc.create_new_document(doctype, rec_links) if used_plugin: return used_plugin['create_instance'](docid=docid, human_readable=human_readable, initial_data=data) return BibDoc(docid=docid, human_readable=human_readable, initial_data=data) def attach_to_record(self, recid, a_type, docname): """ Attaches given document to a record given by its identifier. @param recid The identifier of the record @type recid Integer @param a_type Function of a document in the record @type a_type String @param docname Name of a document inside of a record @type docname String """ run_sql("INSERT INTO bibrec_bibdoc (id_bibrec, id_bibdoc, type, docname) VALUES (%s,%s,%s,%s)", (str(recid), str(self.id), a_type, docname)) self._update_additional_info_files() def __repr__(self): """ @return: the canonical string representation of the C{BibDoc}. @rtype: string """ return 'BibDoc(%s, %s, %s)' % (repr(self.id), repr(self.doctype), repr(self.human_readable)) def format_recids(self): """Returns a string representation of related record ids""" if len(self.bibrec_links) == 1: return self.bibrec_links[0]["recid"] return "[" + ",".join([str(el["recid"]) for el in self.bibrec_links]) + "]" def __str__(self): """ @return: an easy to be I{grepped} string representation of the whole C{BibDoc} content. 
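Each line is prefixed with the related recid(s) and the docid, e.g. (illustrative): '123:456:::doctype=Main'.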
@rtype: string """ recids = self.format_recids() out = '%s:%i:::doctype=%s\n' % (recids, self.id, self.doctype) out += '%s:%i:::status=%s\n' % (recids, self.id, self.status) out += '%s:%i:::basedir=%s\n' % (recids, self.id, self.basedir) out += '%s:%i:::creation date=%s\n' % (recids, self.id, self.cd) out += '%s:%i:::modification date=%s\n' % (recids, self.id, self.md) out += '%s:%i:::text extraction date=%s\n' % (recids, self.id, self.td) out += '%s:%i:::total file attached=%s\n' % (recids, self.id, len(self.docfiles)) if self.human_readable: out += '%s:%i:::total size latest version=%s\n' % (recids, self.id, nice_size(self.get_total_size_latest_version())) out += '%s:%i:::total size all files=%s\n' % (recids, self.id, nice_size(self.get_total_size())) else: out += '%s:%i:::total size latest version=%s\n' % (recids, self.id, self.get_total_size_latest_version()) out += '%s:%i:::total size all files=%s\n' % (recids, self.id, self.get_total_size()) for docfile in self.docfiles: out += str(docfile) return out def get_md5s(self): """ @return: an instance of the Md5Folder class to access MD5 information of the current BibDoc @rtype: Md5Folder """ if self.__md5s is None: self.__md5s = Md5Folder(self.basedir) return self.__md5s md5s = property(get_md5s) def format_already_exists_p(self, docformat): """ @param format: a format to be checked. @type format: string @return: True if a file of the given format already exists among the latest files. @rtype: bool """ docformat = normalize_format(docformat) for afile in self.list_latest_files(): if docformat == afile.get_format(): return True return False def get_status(self): """ @return: the status information. @rtype: string """ return self.status @staticmethod def get_fileprefix(basedir, storagename=None): fname = "%s" % (storagename or "content", ) return os.path.join(basedir, fname ) def get_filepath(self, docformat, version): """ Generaters the path inside of the filesystem where the document should be stored. @param format The format of the document @type format string @param version version to be stored in the file @type version string TODO: this should be completely replaced. File storage (and so, also path building) should be abstracted from BibDoc and be using loadable extensions @param format Format of the document to be stored @type format string @param version Version of the document to be stored @type version String @return Full path to the file encoding a particular version and format of the document @trype string """ return "%s%s;%i" % (BibDoc.get_fileprefix(self.basedir, self.storagename), docformat, version) def get_docname(self): """Obsolete !! (will return empty String for new format documents""" return self.storagename def get_doctype(self, recid): """Retrieves the type of this document in the scope of a given recid""" link_types = [attachement["doctype"] for attachement in self.bibrec_links if str(attachement["recid"]) == str(recid)] if link_types: return link_types[0] return "" def touch(self, action=''): """ Update the modification time of the bibdoc (as in the UNIX command C{touch}). """ run_sql('UPDATE bibdoc SET modification_date=NOW() WHERE id=%s', (self.id, )) self.dirty = True self.last_action = action def change_doctype(self, new_doctype): """ Modify the doctype of a BibDoc """ run_sql('UPDATE bibdoc SET doctype=%s WHERE id=%s', (new_doctype, self.id)) run_sql('UPDATE bibrec_bibdoc SET type=%s WHERE id_bibdoc=%s', (new_doctype, self.id)) self.dirty = True def set_status(self, new_status): """ Set a new status. 
A document with a status information is a restricted document that can be accessed only to user which as an authorization to the I{viewrestrdoc} WebAccess action with keyword status with value C{new_status}. @param new_status: the new status. If empty the document will be unrestricted. @type new_status: string @raise InvenioBibDocFileError: in case the reserved word 'DELETED' is used. """ if new_status != KEEP_OLD_VALUE: if new_status == 'DELETED': raise InvenioBibDocFileError('DELETED is a reserved word and can not be used for setting the status') run_sql('UPDATE bibdoc SET status=%s WHERE id=%s', (new_status, self.id)) self.status = new_status self.touch('status') def add_file_new_version(self, filename, description=None, comment=None, docformat=None, flags=None, modification_date=None): """ Add a new version of a file. If no physical file is already attached to the document a the given file will have version 1. Otherwise the new file will have the current version number plus one. @param filename: the local path of the file. @type filename: string @param description: an optional description for the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be retrieved from the filename (see L{decompose_file}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @raise InvenioBibDocFileError: in case of error. """ latestVersion = self.get_latest_version() if latestVersion == 0: myversion = 1 else: myversion = latestVersion + 1 if os.path.exists(filename): if not os.path.getsize(filename) > 0: raise InvenioBibDocFileError, "%s seems to be empty" % filename if docformat is None: docformat = decompose_file(filename)[2] else: docformat = normalize_format(docformat) destination = self.get_filepath(docformat, myversion) if run_sql("SELECT id_bibdoc FROM bibdocfsinfo WHERE id_bibdoc=%s AND version=%s AND format=%s", (self.id, myversion, docformat)): raise InvenioBibDocFileError("According to the database a file of format %s is already attached to the docid %s" % (docformat, self.id)) try: shutil.copyfile(filename, destination) os.chmod(destination, 0644) if modification_date: # if the modification time of the file needs to be changed update_modification_date_of_file(destination, modification_date) except Exception, e: register_exception() raise InvenioBibDocFileError("Encountered an exception while copying '%s' to '%s': '%s'" % (filename, destination, e)) self.more_info.set_description(description, docformat, myversion) self.more_info.set_comment(comment, docformat, myversion) if flags is None: flags = [] if 'pdfa' in get_subformat_from_format(docformat).split(';') and not 'PDF/A' in flags: flags.append('PDF/A') for flag in flags: if flag == 'PERFORM_HIDE_PREVIOUS': for afile in self.list_all_files(): docformat = afile.get_format() version = afile.get_version() if version < myversion: self.more_info.set_flag('HIDDEN', docformat, myversion) else: self.more_info.set_flag(flag, docformat, myversion) else: raise InvenioBibDocFileError("'%s' does not exists!" 
% filename) self.touch('newversion') Md5Folder(self.basedir).update() just_added_file = self.get_file(docformat, myversion) run_sql("INSERT INTO bibdocfsinfo(id_bibdoc, version, format, last_version, cd, md, checksum, filesize, mime) VALUES(%s, %s, %s, true, %s, %s, %s, %s, %s)", (self.id, myversion, docformat, just_added_file.cd, just_added_file.md, just_added_file.get_checksum(), just_added_file.get_size(), just_added_file.mime)) run_sql("UPDATE bibdocfsinfo SET last_version=false WHERE id_bibdoc=%s AND version<%s", (self.id, myversion)) def add_file_new_format(self, filename, version=None, description=None, comment=None, docformat=None, flags=None, modification_date=None): """ Add a file as a new format. @param filename: the local path of the file. @type filename: string @param version: an optional specific version to which the new format should be added. If None, the last version will be used. @type version: integer @param description: an optional description for the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be retrieved from the filename (see L{decompose_file}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @raise InvenioBibDocFileError: if the given format already exists. """ if version is None: version = self.get_latest_version() if version == 0: version = 1 if os.path.exists(filename): if not os.path.getsize(filename) > 0: raise InvenioBibDocFileError, "%s seems to be empty" % filename if docformat is None: docformat = decompose_file(filename)[2] else: docformat = normalize_format(docformat) if run_sql("SELECT id_bibdoc FROM bibdocfsinfo WHERE id_bibdoc=%s AND version=%s AND format=%s", (self.id, version, docformat)): raise InvenioBibDocFileError("According to the database a file of format %s is already attached to the docid %s" % (docformat, self.id)) destination = self.get_filepath(docformat, version) if os.path.exists(destination): raise InvenioBibDocFileError, "A file for docid '%s' already exists for the format '%s'" % (str(self.id), docformat) try: shutil.copyfile(filename, destination) os.chmod(destination, 0644) if modification_date: # if the modification time of the file needs to be changed update_modification_date_of_file(destination, modification_date) except Exception, e: register_exception() raise InvenioBibDocFileError, "Encountered an exception while copying '%s' to '%s': '%s'" % (filename, destination, e) self.more_info.set_comment(comment, docformat, version) self.more_info.set_description(description, docformat, version) if flags is None: flags = [] if 'pdfa' in get_subformat_from_format(docformat).split(';') and not 'PDF/A' in flags: flags.append('PDF/A') for flag in flags: if flag != 'PERFORM_HIDE_PREVIOUS': self.more_info.set_flag(flag, docformat, version) else: raise InvenioBibDocFileError, "'%s' does not exists!" 
% filename Md5Folder(self.basedir).update() self.touch('newformat') just_added_file = self.get_file(docformat, version) run_sql("INSERT INTO bibdocfsinfo(id_bibdoc, version, format, last_version, cd, md, checksum, filesize, mime) VALUES(%s, %s, %s, true, %s, %s, %s, %s, %s)", (self.id, version, docformat, just_added_file.cd, just_added_file.md, just_added_file.get_checksum(), just_added_file.get_size(), just_added_file.mime)) def change_docformat(self, oldformat, newformat): """ Renames a format name on disk and in all BibDoc structures. The change will touch only the last version files. The change will take place only if the newformat doesn't already exist. @param oldformat: the format that needs to be renamed @type oldformat: string @param newformat: the format new name @type newformat: string """ oldformat = normalize_format(oldformat) newformat = normalize_format(newformat) if self.format_already_exists_p(newformat): # same format already exists in the latest files, abort return for bibdocfile in self.list_latest_files(): if bibdocfile.get_format() == oldformat: # change format -> rename x.oldformat -> x.newformat dirname, base, docformat, version = decompose_file_with_version(bibdocfile.get_full_path()) os.rename(bibdocfile.get_full_path(), os.path.join(dirname, '%s%s;%i' %(base, newformat, version))) Md5Folder(self.basedir).update() self.touch('rename') self._sync_to_db() return def purge(self): """ Physically removes all the previous version of the given bibdoc. Everything but the last formats will be erased. """ version = self.get_latest_version() if version > 1: for afile in self.docfiles: if afile.get_version() < version: self.more_info.unset_comment(afile.get_format(), afile.get_version()) self.more_info.unset_description(afile.get_format(), afile.get_version()) for flag in CFG_BIBDOCFILE_AVAILABLE_FLAGS: self.more_info.unset_flag(flag, afile.get_format(), afile.get_version()) try: os.remove(afile.get_full_path()) except Exception, dummy: register_exception() Md5Folder(self.basedir).update() self.touch('purge') run_sql("DELETE FROM bibdocfsinfo WHERE id_bibdoc=%s AND version<%s", (self.id, version)) def expunge(self): """ Physically remove all the traces of a given document. @note: an expunged BibDoc object shouldn't be used anymore or the result might be unpredicted. """ del self.__md5s self.more_info.delete() del self.more_info os.system('rm -rf %s' % escape_shell_arg(self.basedir)) run_sql('DELETE FROM bibrec_bibdoc WHERE id_bibdoc=%s', (self.id, )) run_sql('DELETE FROM bibdoc_bibdoc WHERE id_bibdoc1=%s OR id_bibdoc2=%s', (self.id, self.id)) run_sql('DELETE FROM bibdoc WHERE id=%s', (self.id, )) run_sql('INSERT INTO hstDOCUMENT(action, id_bibdoc, doctimestamp) VALUES("EXPUNGE", %s, NOW())', (self.id, )) run_sql('DELETE FROM bibdocfsinfo WHERE id_bibdoc=%s', (self.id, )) del self._docfiles del self.id del self.cd del self.md del self.td del self.basedir del self.doctype del self.bibrec_links def revert(self, version): """ Revert the document to a given version. All the formats corresponding to that version are copied forward to a new version. @param version: the version to revert to. 
@type version: integer @raise InvenioBibDocFileError: in case of errors """ version = int(version) docfiles = self.list_version_files(version) if docfiles: self.add_file_new_version(docfiles[0].get_full_path(), description=docfiles[0].get_description(), comment=docfiles[0].get_comment(), docformat=docfiles[0].get_format(), flags=docfiles[0].flags) for docfile in docfiles[1:]: self.add_file_new_format(docfile.filename, description=docfile.get_description(), comment=docfile.get_comment(), docformat=docfile.get_format(), flags=docfile.flags) def import_descriptions_and_comments_from_marc(self, record=None): """ Import descriptions and comments from the corresponding MARC metadata. @param record: the record (if None it will be calculated). @type record: bibrecord recstruct @note: If record is passed it is directly used, otherwise it is retrieved from the MARCXML stored in the database. """ ## Let's get the record from invenio.search_engine import get_record if record is None: record = get_record(self.id) fields = record_get_field_instances(record, '856', '4', ' ') global_comment = None global_description = None local_comment = {} local_description = {} for field in fields: url = field_get_subfield_values(field, 'u') if url: ## Given a url url = url[0] if re.match('%s/%s/[0-9]+/files/' % (CFG_SITE_URL, CFG_SITE_RECORD), url): ## If it is a traditional /CFG_SITE_RECORD/1/files/ one ## We have global description/comment for all the formats description = field_get_subfield_values(field, 'y') if description: global_description = description[0] comment = field_get_subfield_values(field, 'z') if comment: global_comment = comment[0] elif bibdocfile_url_p(url): ## Otherwise we have description/comment per format dummy, docname, docformat = decompose_bibdocfile_url(url) brd = BibRecDocs(self.id) if docname == brd.get_docname(self.id): description = field_get_subfield_values(field, 'y') if description: local_description[docformat] = description[0] comment = field_get_subfield_values(field, 'z') if comment: local_comment[docformat] = comment[0] ## Let's update the tables version = self.get_latest_version() for docfile in self.list_latest_files(): docformat = docfile.get_format() if docformat in local_comment: self.set_comment(local_comment[docformat], docformat, version) else: self.set_comment(global_comment, docformat, version) if docformat in local_description: self.set_description(local_description[docformat], docformat, version) else: self.set_description(global_description, docformat, version) self.dirty = True def get_icon(self, subformat_re=CFG_BIBDOCFILE_ICON_SUBFORMAT_RE, display_hidden=True): """ @param subformat_re: by default the convention is that L{CFG_BIBDOCFILE_ICON_SUBFORMAT_RE} is used as a subformat indicator to mean that a particular format is to be used as an icon. Specifiy a different subformat if you need to use a different convention. @type subformat_re: compiled regular expression @return: the bibdocfile corresponding to CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT or, if this does not exist, the smallest size icon of this document, or None if no icon exists for this document. @rtype: BibDocFile @warning: before I{subformat} were introduced this method was returning a BibDoc, while now is returning a BibDocFile. Check if your client code is compatible with this. 
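Typical use (illustrative sketch; assumes an icon is actually attached to the document):
    icon = bibdoc.get_icon()
    icon_url = icon.get_url() if icon else None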
""" icons = [] for docfile in self.list_latest_files(list_hidden=display_hidden): subformat = docfile.get_subformat() if subformat.lower() == CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT.lower(): # If it's the default icon subformat, return it return docfile if subformat_re.match(subformat): icons.append((docfile.get_size(), docfile)) if icons: # Sort by size, retrieve the smallest one icons.sort() return icons[0][1] return None def add_icon(self, filename, docformat=None, subformat=CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT, modification_date=None): """ Attaches icon to this document. @param filename: the local filesystem path to the icon. @type filename: string @param format: an optional format for the icon. If not specified it will be calculated after the filesystem path. @type format: string @param subformat: by default the convention is that CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT is used as a subformat indicator to mean that a particular format is to be used as an icon. Specifiy a different subformat if you need to use a different convention. @type subformat: string @raise InvenioBibDocFileError: in case of errors. """ #first check if an icon already exists if not docformat: docformat = decompose_file(filename)[2] if subformat: docformat += ";%s" % subformat self.add_file_new_format(filename, docformat=docformat, modification_date=modification_date) def delete_icon(self, subformat_re=CFG_BIBDOCFILE_ICON_SUBFORMAT_RE): """ @param subformat_re: by default the convention is that L{CFG_BIBDOCFILE_ICON_SUBFORMAT_RE} is used as a subformat indicator to mean that a particular format is to be used as an icon. Specifiy a different subformat if you need to use a different convention. @type subformat: compiled regular expression Removes the icon attached to the document if it exists. """ for docfile in self.list_latest_files(): if subformat_re.match(docfile.get_subformat()): self.delete_file(docfile.get_format(), docfile.get_version()) def change_name(self, recid, newname): """ Renames this document in connection with a given record. @param newname: the new name. @type newname: string @raise InvenioBibDocFileError: if the new name corresponds to a document already attached to the record owning this document or if the name was not changed. """ newname = normalize_docname(newname) res = run_sql("SELECT id_bibdoc FROM bibrec_bibdoc WHERE id_bibrec=%s AND docname=%s", (recid, newname)) if res: raise InvenioBibDocFileError("A bibdoc called %s already exists for recid %s" % (newname, recid)) updated = run_sql("update bibrec_bibdoc set docname=%s where id_bibdoc=%s and id_bibrec=%s", (newname, self.id, recid)) if not updated: raise InvenioBibDocFileError("Docname for bibdoc %s in record %s was not changed" % (self.id, recid)) # docid is known, the document already exists res2 = run_sql("SELECT id_bibrec, type, docname FROM bibrec_bibdoc WHERE id_bibdoc=%s", (self.id,)) ## Refreshing names and types. self.bibrec_types = [(r[0], r[1], r[2]) for r in res2 ] # just in case the result was behaving like tuples but was something else if not res2: # fake attachment self.bibrec_types = [(0, None, "fake_name_for_unattached_document")] self.touch('rename') def set_comment(self, comment, docformat, version=None): """ Updates the comment of a specific format/version of the document. @param comment: the new comment. @type comment: string @param format: the specific format for which the comment should be updated. @type format: string @param version: the specific version for which the comment should be updated. 
If not specified the last version will be used. @type version: integer """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) self.more_info.set_comment(comment, docformat, version) self.dirty = True def set_description(self, description, docformat, version=None): """ Updates the description of a specific format/version of the document. @param description: the new description. @type description: string @param format: the specific format for which the description should be updated. @type format: string @param version: the specific version for which the description should be updated. If not specified the last version will be used. @type version: integer """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) self.more_info.set_description(description, docformat, version) self.dirty = True def set_flag(self, flagname, docformat, version=None): """ Sets a flag for a specific format/version of the document. @param flagname: a flag from L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}. @type flagname: string @param format: the specific format for which the flag should be set. @type format: string @param version: the specific version for which the flag should be set. If not specified the last version will be used. @type version: integer """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) self.more_info.set_flag(flagname, docformat, version) self.dirty = True def has_flag(self, flagname, docformat, version=None): """ Checks if a particular flag for a format/version is set. @param flagname: a flag from L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}. @type flagname: string @param format: the specific format for which the flag should be set. @type format: string @param version: the specific version for which the flag should be set. If not specified the last version will be used. @type version: integer @return: True if the flag is set. @rtype: bool """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) return self.more_info.has_flag(flagname, docformat, version) def unset_flag(self, flagname, docformat, version=None): """ Unsets a flag for a specific format/version of the document. @param flagname: a flag from L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}. @type flagname: string @param format: the specific format for which the flag should be unset. @type format: string @param version: the specific version for which the flag should be unset. If not specified the last version will be used. @type version: integer """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) self.more_info.unset_flag(flagname, docformat, version) self.dirty = True def get_comment(self, docformat, version=None): """ Retrieve the comment of a specific format/version of the document. @param format: the specific format for which the comment should be retrieved. @type format: string @param version: the specific version for which the comment should be retrieved. If not specified the last version will be used. @type version: integer @return: the comment. @rtype: string """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) return self.more_info.get_comment(docformat, version) def get_description(self, docformat, version=None): """ Retrieve the description of a specific format/version of the document. @param format: the specific format for which the description should be retrieved. 
@type format: string @param version: the specific version for which the description should be retrieved. If not specified the last version will be used. @type version: integer @return: the description. @rtype: string """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) return self.more_info.get_description(docformat, version) def hidden_p(self, docformat, version=None): """ Returns True if the file specified by the given format/version is hidden. @param format: the specific format for which the description should be retrieved. @type format: string @param version: the specific version for which the description should be retrieved. If not specified the last version will be used. @type version: integer @return: True if hidden. @rtype: bool """ if version is None: version = self.get_latest_version() return self.more_info.has_flag('HIDDEN', docformat, version) def get_base_dir(self): """ @return: the base directory on the local filesystem for this document (e.g. C{/soft/cdsweb/var/data/files/g0/123}) @rtype: string """ return self.basedir def get_type(self): """ @return: the type of this document. @rtype: string""" return self.doctype def get_id(self): """ @return: the id of this document. @rtype: integer """ return self.id def get_file(self, docformat, version="", exact_docformat=False): """ Returns a L{BibDocFile} instance of this document corresponding to the specific format and version. @param format: the specific format. @type format: string @param version: the specific version for which the description should be retrieved. If not specified the last version will be used. @type version: integer @param exact_docformat: if True, consider always the complete docformat (including subformat if any) @type exact_docformat: bool @return: the L{BibDocFile} instance. @rtype: BibDocFile """ if version == "": docfiles = self.list_latest_files() else: version = int(version) docfiles = self.list_version_files(version) docformat = normalize_format(docformat) for docfile in docfiles: if (docfile.get_format() == docformat or not docformat): return docfile ## Let's skip the subformat specification and consider just the ## superformat if not exact_docformat: superformat = get_superformat_from_format(docformat) for docfile in docfiles: if get_superformat_from_format(docfile.get_format()) == superformat: return docfile raise InvenioBibDocFileError("No file for doc %i of format '%s', version '%s'" % (self.id, docformat, version)) def list_versions(self): """ @return: the list of existing version numbers for this document. @rtype: list of integer """ versions = [] for docfile in self.docfiles: if not docfile.get_version() in versions: versions.append(docfile.get_version()) versions.sort() return versions def delete(self, recid=None): """ Delete this document. @see: L{undelete} for how to undelete the document. @raise InvenioBibDocFileError: in case of errors. 
""" try: today = datetime.today() recids = [] if recid: recids = [recid] else: recids = [link["recid"] for link in self.bibrec_links] for rid in recids: brd = BibRecDocs(rid) docname = brd.get_docname(self.id) # if the document is attached to some records brd.change_name(docid=self.id, newname = 'DELETED-%s%s-%s' % (today.strftime('%Y%m%d%H%M%S'), today.microsecond, docname)) run_sql("UPDATE bibdoc SET status='DELETED' WHERE id=%s", (self.id,)) self.status = 'DELETED' except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError, "It's impossible to delete bibdoc %s: %s" % (self.id, e) def deleted_p(self): """ @return: True if this document has been deleted. @rtype: bool """ return self.status == 'DELETED' def empty_p(self): """ @return: True if this document is empty, i.e. it has no bibdocfile connected. @rtype: bool """ return len(self.docfiles) == 0 def undelete(self, previous_status='', recid=None): """ Undelete a deleted file (only if it was actually deleted via L{delete}). The previous C{status}, i.e. the restriction key can be provided. Otherwise the undeleted document will be public. @param previous_status: the previous status the should be restored. @type previous_status: string @raise InvenioBibDocFileError: in case of any error. """ try: run_sql("UPDATE bibdoc SET status=%s WHERE id=%s AND status='DELETED'", (previous_status, self.id)) except Exception, e: raise InvenioBibDocFileError, "It's impossible to undelete bibdoc %s: %s" % (self.id, e) if recid: bibrecdocs = BibRecDocs(recid) docname = bibrecdocs.get_docname(self.id) if docname.startswith('DELETED-'): try: # Let's remove DELETED-20080214144322- in front of the docname original_name = '-'.join(docname.split('-')[2:]) original_name = bibrecdocs.propose_unique_docname(original_name) bibrecdocs.change_name(docid=self.id, newname=original_name) except Exception, e: raise InvenioBibDocFileError, "It's impossible to restore the previous docname %s. %s kept as docname because: %s" % (original_name, docname, e) else: raise InvenioBibDocFileError, "Strange just undeleted docname isn't called DELETED-somedate-docname but %s" % docname def delete_file(self, docformat, version): """ Delete a specific format/version of this document on the filesystem. @param format: the particular format to be deleted. @type format: string @param version: the particular version to be deleted. @type version: integer @note: this operation is not reversible!""" try: afile = self.get_file(docformat, version) except InvenioBibDocFileError: return try: os.remove(afile.get_full_path()) run_sql("DELETE FROM bibdocfsinfo WHERE id_bibdoc=%s AND version=%s AND format=%s", (self.id, afile.get_version(), afile.get_format())) last_version = run_sql("SELECT max(version) FROM bibdocfsinfo WHERE id_bibdoc=%s", (self.id, ))[0][0] if last_version: ## Updating information about last version run_sql("UPDATE bibdocfsinfo SET last_version=true WHERE id_bibdoc=%s AND version=%s", (self.id, last_version)) run_sql("UPDATE bibdocfsinfo SET last_version=false WHERE id_bibdoc=%s AND version<>%s", (self.id, last_version)) except OSError: pass self.touch('delete') def get_history(self): """ @return: a human readable and parsable string that represent the history of this document. 
@rtype: string """ ret = [] hst = run_sql("""SELECT action, docname, docformat, docversion, docsize, docchecksum, doctimestamp FROM hstDOCUMENT WHERE id_bibdoc=%s ORDER BY doctimestamp ASC""", (self.id, )) for row in hst: ret.append("%s %s '%s', format: '%s', version: %i, size: %s, checksum: '%s'" % (row[6].strftime('%Y-%m-%d %H:%M:%S'), row[0], row[1], row[2], row[3], nice_size(row[4]), row[5])) return ret def _build_file_list(self, context=''): """ Lists all files attached to the bibdoc. This function should be called everytime the bibdoc is modified. As a side effect it log everything that has happened to the bibdocfiles in the log facility, according to the context: "init": means that the function has been called; for the first time by a constructor, hence no logging is performed "": by default means to log every deleted file as deleted and every added file as added; "rename": means that every appearently deleted file is logged as renamef and every new file as renamet. """ def log_action(action, docid, docname, docformat, version, size, checksum, timestamp=''): """Log an action into the bibdoclog table.""" try: if timestamp: run_sql('INSERT INTO hstDOCUMENT(action, id_bibdoc, docname, docformat, docversion, docsize, docchecksum, doctimestamp) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)', (action, docid, docname, docformat, version, size, checksum, timestamp)) else: run_sql('INSERT INTO hstDOCUMENT(action, id_bibdoc, docname, docformat, docversion, docsize, docchecksum, doctimestamp) VALUES(%s, %s, %s, %s, %s, %s, %s, NOW())', (action, docid, docname, docformat, version, size, checksum)) except DatabaseError: register_exception() def make_removed_added_bibdocfiles(previous_file_list): """Internal function for build the log of changed files.""" # Let's rebuild the previous situation old_files = {} for bibdocfile in previous_file_list: old_files[(bibdocfile.name, bibdocfile.format, bibdocfile.version)] = (bibdocfile.size, bibdocfile.checksum, bibdocfile.md) # Let's rebuild the new situation new_files = {} for bibdocfile in self._docfiles: new_files[(bibdocfile.name, bibdocfile.format, bibdocfile.version)] = (bibdocfile.size, bibdocfile.checksum, bibdocfile.md) # Let's subtract from added file all the files that are present in # the old list, and let's add to deleted files that are not present # added file. 
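            # Illustrative sketch, not part of the original code, using
            # hypothetical snapshots keyed by (docname, format, version) and
            # valued by (size, checksum, modification-date) tuples:
            #   old_files = {('thesis', '.pdf', 1): (1024, 'aaa...', None)}
            #   new_files = {('thesis', '.pdf', 1): (1024, 'aaa...', None),
            #                ('thesis', '.pdf', 2): (2048, 'bbb...', None)}
            # The lines below would then report ('thesis', '.pdf', 2) as added
            # and nothing as deleted: every key still present in old_files is
            # dropped from the copy of new_files, and old keys no longer on
            # disk end up in deleted_files.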
added_files = dict(new_files) deleted_files = {} for key, value in old_files.iteritems(): if added_files.has_key(key): del added_files[key] else: deleted_files[key] = value return (added_files, deleted_files) if context != ('init', 'init_from_disk'): previous_file_list = list(self._docfiles) res = run_sql("SELECT status, creation_date," "modification_date FROM bibdoc WHERE id=%s", (self.id,)) self.cd = res[0][1] self.md = res[0][2] self.status = res[0][0] self.more_info = BibDocMoreInfo(self.id) self._docfiles = [] if CFG_BIBDOCFILE_ENABLE_BIBDOCFSINFO_CACHE and context == 'init': ## In normal init context we read from DB res = run_sql("SELECT version, format, cd, md, checksum, filesize FROM bibdocfsinfo WHERE id_bibdoc=%s", (self.id, )) for version, docformat, cd, md, checksum, size in res: filepath = self.get_filepath(docformat, version) self._docfiles.append(BibDocFile( filepath, self.bibrec_types, version, docformat, self.id, self.status, checksum, self.more_info, human_readable=self.human_readable, cd=cd, md=md, size=size, bibdoc=self)) else: if os.path.exists(self.basedir): files = os.listdir(self.basedir) files.sort() for afile in files: if not afile.startswith('.'): try: filepath = os.path.join(self.basedir, afile) dummy, dummy, docformat, fileversion = decompose_file_with_version(filepath) checksum = self.md5s.get_checksum(afile) self._docfiles.append(BibDocFile(filepath, self.bibrec_types, fileversion, docformat, self.id, self.status, checksum, self.more_info, human_readable=self.human_readable, bibdoc=self)) except Exception, e: register_exception() raise InvenioBibDocFileError, e if context in ('init', 'init_from_disk'): return else: added_files, deleted_files = make_removed_added_bibdocfiles(previous_file_list) deletedstr = "DELETED" addedstr = "ADDED" if context == 'rename': deletedstr = "RENAMEDFROM" addedstr = "RENAMEDTO" for (docname, docformat, version), (size, checksum, md) in added_files.iteritems(): if context == 'rename': md = '' # No modification time log_action(addedstr, self.id, docname, docformat, version, size, checksum, md) for (docname, docformat, version), (size, checksum, md) in deleted_files.iteritems(): if context == 'rename': md = '' # No modification time log_action(deletedstr, self.id, docname, docformat, version, size, checksum, md) def _sync_to_db(self): """ Update the content of the bibdocfile table by taking what is available on the filesystem. """ self._build_file_list('init_from_disk') run_sql("DELETE FROM bibdocfsinfo WHERE id_bibdoc=%s", (self.id,)) for afile in self.docfiles: run_sql("INSERT INTO bibdocfsinfo(id_bibdoc, version, format, last_version, cd, md, checksum, filesize, mime) VALUES(%s, %s, %s, false, %s, %s, %s, %s, %s)", (self.id, afile.get_version(), afile.get_format(), afile.cd, afile.md, afile.get_checksum(), afile.get_size(), afile.mime)) run_sql("UPDATE bibdocfsinfo SET last_version=true WHERE id_bibdoc=%s AND version=%s", (self.id, self.get_latest_version())) def _build_related_file_list(self): """Lists all files attached to the bibdoc. This function should be called everytime the bibdoc is modified within e.g. its icon. @deprecated: use subformats instead. 
""" self.related_files = {} res = run_sql("SELECT ln.id_bibdoc2,ln.rel_type,bibdoc.status FROM " "bibdoc_bibdoc AS ln,bibdoc WHERE bibdoc.id=ln.id_bibdoc2 AND " "ln.id_bibdoc1=%s", (str(self.id),)) for row in res: docid = row[0] doctype = row[1] if row[2] != 'DELETED': if not self.related_files.has_key(doctype): self.related_files[doctype] = [] cur_doc = BibDoc.create_instance(docid=docid, human_readable=self.human_readable) self.related_files[doctype].append(cur_doc) def get_total_size_latest_version(self, user_info=None, subformat=None): """Return the total size used on disk of all the files belonging to this bibdoc and corresponding to the latest version. Restricted and hidden files are not counted, unless there is no user_info. @param user_info: the user_info dictionary, used to check restrictions @type: dict @param subformat: if subformat is specified, it limits files only to those from that specific subformat @type subformat: string """ ret = 0 all_files = False # If we are calling this function without user_info, then we want to # see all the files if not user_info: all_files = True for bibdocfile in self.list_latest_files(): # First check for restrictions if all_files or (bibdocfile.is_restricted(user_info)[0] == 0 and not bibdocfile.hidden): # Then check if the format is correct if subformat is None or bibdocfile.get_subformat() == subformat: ret += bibdocfile.get_size() return ret def get_total_size(self): """Return the total size used on disk of all the files belonging to this bibdoc.""" ret = 0 for bibdocfile in self.list_all_files(): ret += bibdocfile.get_size() return ret def list_all_files(self, list_hidden=True): """Returns all the docfiles linked with the given bibdoc.""" if list_hidden: return self.docfiles else: return [afile for afile in self.docfiles if not afile.hidden_p()] def list_latest_files(self, list_hidden=True): """Returns all the docfiles within the last version.""" return self.list_version_files(self.get_latest_version(), list_hidden=list_hidden) def list_version_files(self, version, list_hidden=True): """Return all the docfiles of a particular version.""" version = int(version) return [docfile for docfile in self.docfiles if docfile.get_version() == version and (list_hidden or not docfile.hidden_p())] def get_latest_version(self): """ Returns the latest existing version number for the given bibdoc. If no file is associated to this bibdoc, returns '0'. """ version = 0 for bibdocfile in self.docfiles: if bibdocfile.get_version() > version: version = bibdocfile.get_version() return version def get_file_number(self): """Return the total number of files.""" return len(self.docfiles) def register_download(self, ip_address, version, docformat, userid=0, recid=0): """Register the information about a download of a particular file.""" docformat = normalize_format(docformat) if docformat[:1] == '.': docformat = docformat[1:] docformat = docformat.upper() if not version: version = self.get_latest_version() return run_sql("INSERT INTO rnkDOWNLOADS " "(id_bibrec,id_bibdoc,file_version,file_format," "id_user,client_host,download_time) VALUES " "(%s,%s,%s,%s,%s,INET_ATON(%s),NOW())", (recid, self.id, version, docformat, userid, ip_address,)) def get_incoming_relations(self, rel_type=None): """Return all relations in which this BibDoc appears on target position @param rel_type: Type of the relation, to which we want to limit our search. 
        None = any type
        @type rel_type: string
        @return: List of BibRelation instances
        @rtype: list
        """
        return BibRelation.get_relations(rel_type=rel_type,
                                         bibdoc2_id=self.id)

    def get_outgoing_relations(self, rel_type=None):
        """Return all relations in which this BibDoc appears on the source position
        @param rel_type: Type of the relation, to which we want to limit our search.
                         None = any type
        @type rel_type: string
        @return: List of BibRelation instances
        @rtype: list
        """
        return BibRelation.get_relations(rel_type=rel_type,
                                         bibdoc1_id=self.id)

    def create_outgoing_relation(self, bibdoc2, rel_type):
        """
        Create an outgoing relation between the current BibDoc and a different one
        """
        return BibRelation.create(bibdoc1_id=self.id,
                                  bibdoc2_id=bibdoc2.id,
                                  rel_type=rel_type)

    def create_incoming_relation(self, bibdoc1, rel_type):
        """
        Create an incoming relation between a particular version of the current
        BibDoc and a particular version of a different BibDoc
        """
        return BibRelation.create(bibdoc1_id=bibdoc1.id,
                                  bibdoc2_id=self.id,
                                  rel_type=rel_type)


def generic_path2bidocfile(fullpath):
    """
    Returns a BibDocFile object that wraps the given fullpath.
    @note: the object will contain the minimum information that can be
        guessed from the fullpath (e.g. docname, format, subformat, version,
        md5, creation_date, modification_date). It won't contain for example
        a comment, a description, a doctype, a restriction.
    """
    fullpath = os.path.abspath(fullpath)
    try:
        path, name, docformat, version = decompose_file_with_version(fullpath)
    except ValueError:
        ## There is no version
        version = 0
        path, name, docformat = decompose_file(fullpath)
    md5folder = Md5Folder(path)
    checksum = md5folder.get_checksum(os.path.basename(fullpath))
    return BibDocFile(fullpath=fullpath,
                      recid_doctypes=[(0, None, name)],
                      version=version,
                      docformat=docformat,
                      docid=0,
                      status=None,
                      checksum=checksum,
                      more_info=None)


class BibDocFile(object):
    """This class represents a physical file in the Invenio filesystem.
It should never be instantiated directly""" def __init__(self, fullpath, recid_doctypes, version, docformat, docid, status, checksum, more_info=None, human_readable=False, cd=None, md=None, size=None, bibdoc=None): self.fullpath = os.path.abspath(fullpath) self.docid = docid self.recids_doctypes = recid_doctypes self.version = version self.status = status self.checksum = checksum self.human_readable = human_readable self.name = recid_doctypes[0][2] if bibdoc is not None: self.__bibdoc = ref(bibdoc) else: self.__bibdoc = None if more_info: self.description = more_info.get_description(docformat, version) self.comment = more_info.get_comment(docformat, version) self.flags = more_info.get_flags(docformat, version) else: self.description = None self.comment = None self.flags = [] self.format = normalize_format(docformat) self.superformat = get_superformat_from_format(self.format) self.subformat = get_subformat_from_format(self.format) if docformat: self.recids_doctypes = [(a,b,c+self.superformat) for (a,b,c) in self.recids_doctypes] self.mime, self.encoding = _mimes.guess_type(self.recids_doctypes[0][2]) if self.mime is None: self.mime = "application/octet-stream" self.more_info = more_info self.hidden = 'HIDDEN' in self.flags self.size = size or os.path.getsize(fullpath) self.md = md or datetime.fromtimestamp(os.path.getmtime(fullpath)) try: self.cd = cd or datetime.fromtimestamp(os.path.getctime(fullpath)) except OSError: self.cd = self.md self.dir = os.path.dirname(fullpath) # make filename url safe url_safe_filename = urllib.quote(self.name) if self.subformat: self.url = create_url('%s/%s/%s/files/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, self.recids_doctypes[0][0], url_safe_filename, self.superformat), {'subformat' : self.subformat}) self.fullurl = create_url('%s/%s/%s/files/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, self.recids_doctypes[0][0], url_safe_filename, self.superformat), {'subformat' : self.subformat, 'version' : self.version}) else: self.url = create_url('%s/%s/%s/files/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, self.recids_doctypes[0][0], url_safe_filename, self.superformat), {}) self.fullurl = create_url('%s/%s/%s/files/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, self.recids_doctypes[0][0], url_safe_filename, self.superformat), {'version' : self.version}) self.etag = '"%i%s%i"' % (self.docid, self.format, self.version) self.magic = None @property def bibdoc(self): """ Wrapper around the referenced bibdoc necesseary to avoid memory leaks. """ if self.__bibdoc is None or self.__bibdoc() is None: bibdoc = BibDoc(self.docid) self.__bibdoc = ref(bibdoc) return bibdoc return self.__bibdoc() def __getstate__(self): """Remove weakref so the object can be pickled.""" dict_ = copy.copy(self.__dict__) dict_['_BibDocFile__bibdoc'] = self.bibdoc return dict_ def __setstate__(self, data_dict): """Undo what `__getstate__` did setting back the weakref. 
:param data_dict: `dict` from `__getstate__` """ for (name, value) in data_dict.iteritems(): setattr(self, name, value) if self.__bibdoc is not None: self.__bibdoc = ref(self.__bibdoc) def __repr__(self): return ('BibDocFile(%s, %i, %s, %s, %i, %i, %s, %s, %s, %s)' % (repr(self.fullpath), self.version, repr(self.name), repr(self.format), self.recids_doctypes[0][0], self.docid, repr(self.status), repr(self.checksum), repr(self.more_info), repr(self.human_readable))) def format_recids(self): if self.bibdoc: return self.bibdoc.format_recids() return "0" def __str__(self): recids = self.format_recids() out = '%s:%s:%s:%s:fullpath=%s\n' % (recids, self.docid, self.version, self.format, self.fullpath) out += '%s:%s:%s:%s:name=%s\n' % (recids, self.docid, self.version, self.format, self.name) out += '%s:%s:%s:%s:subformat=%s\n' % (recids, self.docid, self.version, self.format, get_subformat_from_format(self.format)) out += '%s:%s:%s:%s:status=%s\n' % (recids, self.docid, self.version, self.format, self.status) out += '%s:%s:%s:%s:checksum=%s\n' % (recids, self.docid, self.version, self.format, self.checksum) if self.human_readable: out += '%s:%s:%s:%s:size=%s\n' % (recids, self.docid, self.version, self.format, nice_size(self.size)) else: out += '%s:%s:%s:%s:size=%s\n' % (recids, self.docid, self.version, self.format, self.size) out += '%s:%s:%s:%s:creation time=%s\n' % (recids, self.docid, self.version, self.format, self.cd) out += '%s:%s:%s:%s:modification time=%s\n' % (recids, self.docid, self.version, self.format, self.md) out += '%s:%s:%s:%s:magic=%s\n' % (recids, self.docid, self.version, self.format, self.get_magic()) out += '%s:%s:%s:%s:mime=%s\n' % (recids, self.docid, self.version, self.format, self.mime) out += '%s:%s:%s:%s:encoding=%s\n' % (recids, self.docid, self.version, self.format, self.encoding) out += '%s:%s:%s:%s:url=%s\n' % (recids, self.docid, self.version, self.format, self.url) out += '%s:%s:%s:%s:fullurl=%s\n' % (recids, self.docid, self.version, self.format, self.fullurl) out += '%s:%s:%s:%s:description=%s\n' % (recids, self.docid, self.version, self.format, self.description) out += '%s:%s:%s:%s:comment=%s\n' % (recids, self.docid, self.version, self.format, self.comment) out += '%s:%s:%s:%s:hidden=%s\n' % (recids, self.docid, self.version, self.format, self.hidden) out += '%s:%s:%s:%s:flags=%s\n' % (recids, self.docid, self.version, self.format, self.flags) out += '%s:%s:%s:%s:etag=%s\n' % (recids, self.docid, self.version, self.format, self.etag) return out def is_restricted(self, user_info): """Returns restriction state. (see acc_authorize_action return values)""" if self.status not in ('', 'DELETED'): return check_bibdoc_authorization(user_info, status=self.status) elif self.status == 'DELETED': return (1, 'File has ben deleted') else: return (0, '') def is_icon(self, subformat_re=CFG_BIBDOCFILE_ICON_SUBFORMAT_RE): """ @param subformat_re: by default the convention is that L{CFG_BIBDOCFILE_ICON_SUBFORMAT_RE} is used as a subformat indicator to mean that a particular format is to be used as an icon. Specifiy a different subformat if you need to use a different convention. @type subformat: compiled regular expression @return: True if this file is an icon. 
        @rtype: bool
        """
        return bool(subformat_re.match(self.subformat))

    def hidden_p(self):
        return self.hidden

    def get_url(self):
        return self.url

    def get_type(self):
        """Returns the first type connected with the bibdoc of this file."""
        return self.recids_doctypes[0][1]

    def get_path(self):
        return self.fullpath

    def get_bibdocid(self):
        return self.docid

    def get_name(self):
        return self.name

    def get_full_name(self):
        """Returns the first name connected with the bibdoc of this file."""
        return self.recids_doctypes[0][2]

    def get_full_path(self):
        return self.fullpath

    def get_format(self):
        return self.format

    def get_subformat(self):
        return self.subformat

    def get_superformat(self):
        return self.superformat

    def get_size(self):
        return self.size

    def get_version(self):
        return self.version

    def get_checksum(self):
        return self.checksum

    def get_description(self):
        return self.description

    def get_comment(self):
        return self.comment

    def get_content(self):
        """Returns the binary content of the file."""
        content_fd = open(self.fullpath, 'rb')
        content = content_fd.read()
        content_fd.close()
        return content

    def get_recid(self):
        """Returns the first recid connected with the bibdoc of this file."""
        return self.recids_doctypes[0][0]

    def get_status(self):
        """Returns the status of the file, i.e. either '', 'DELETED' or a
        restriction keyword."""
        return self.status

    def get_magic(self):
        """Return all the possible guesses from the magic library about
        the content of the file."""
        if self.magic is None:
            if CFG_HAS_MAGIC == 1:
                magic_cookies = _get_magic_cookies()
                magic_result = []
                for key in magic_cookies.keys():
                    magic_result.append(magic_cookies[key].file(self.fullpath))
                self.magic = tuple(magic_result)
            elif CFG_HAS_MAGIC == 2:
                magic_result = []
                for key in ({'mime': False, 'mime_encoding': False},
                            {'mime': True, 'mime_encoding': False},
                            {'mime': False, 'mime_encoding': True}):
                    magic_result.append(_magic_wrapper(self.fullpath, **key))
                self.magic = tuple(magic_result)
        return self.magic

    def check(self):
        """Return True if the checksum corresponds to the file."""
        return calculate_md5(self.fullpath) == self.checksum

    def stream(self, req, download=False):
        """Stream the file. Note that no restriction check is being done
        here, since restrictions have been checked previously inside
        websubmit_webinterface.py."""
        if os.path.exists(self.fullpath):
            if random.random() < CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY and calculate_md5(self.fullpath) != self.checksum:
                raise InvenioBibDocFileError, "File %s, version %i, is corrupted!" % (self.recids_doctypes[0][2], self.version)
            stream_file(req, self.fullpath, "%s%s" % (self.name, self.superformat), self.mime, self.encoding, self.etag, self.checksum, self.fullurl, download=download)
            raise apache.SERVER_RETURN, apache.DONE
        else:
            req.status = apache.HTTP_NOT_FOUND
            raise InvenioBibDocFileError, "%s does not exist!" % self.fullpath

_RE_STATUS_PARSER = re.compile(r'^(?P<type>email|group|egroup|role|firerole|status):\s*(?P<value>.*)$', re.S + re.I)

def check_bibdoc_authorization(user_info, status):
    """
    Check if the user is authorized to access a document protected with the given status.
L{status} is a string of the form:: auth_type: auth_value where C{auth_type} can have values in:: email, group, role, firerole, status and C{auth_value} has a value interpreted againsta C{auth_type}: - C{email}: the user can access the document if his/her email matches C{auth_value} - C{group}: the user can access the document if one of the groups (local or external) of which he/she is member matches C{auth_value} - C{role}: the user can access the document if he/she belongs to the WebAccess role specified in C{auth_value} - C{firerole}: the user can access the document if he/she is implicitly matched by the role described by the firewall like role definition in C{auth_value} - C{status}: the user can access the document if he/she is authorized to for the action C{viewrestrdoc} with C{status} paramter having value C{auth_value} @note: If no C{auth_type} is specified or if C{auth_type} is not one of the above, C{auth_value} will be set to the value contained in the parameter C{status}, and C{auth_type} will be considered to be C{status}. @param user_info: the user_info dictionary @type: dict @param status: the status of the document. @type status: string @return: a tuple, of the form C{(auth_code, auth_message)} where auth_code is 0 if the authorization is granted and greater than 0 otherwise. @rtype: (int, string) @raise ValueError: in case of unexpected parsing error. """ if not status: return (0, CFG_WEBACCESS_WARNING_MSGS[0]) def parse_status(status): g = _RE_STATUS_PARSER.match(status) if g: return (g.group('type').lower(), g.group('value')) else: return ('status', status) if acc_is_user_in_role(user_info, acc_get_role_id(SUPERADMINROLE)): return (0, CFG_WEBACCESS_WARNING_MSGS[0]) auth_type, auth_value = parse_status(status) if auth_type == 'status': return acc_authorize_action(user_info, 'viewrestrdoc', status=auth_value) elif auth_type == 'email': if not auth_value.lower().strip() == user_info['email'].lower().strip(): return (1, 'You must be member of the group %s in order to access this document' % repr(auth_value)) elif auth_type == 'group': if not auth_value in user_info['group']: return (1, 'You must be member of the group %s in order to access this document' % repr(auth_value)) elif auth_type == 'role': if not acc_is_user_in_role(user_info, acc_get_role_id(auth_value)): return (1, 'You must be member in the role %s in order to access this document' % repr(auth_value)) elif auth_type == 'firerole': if not acc_firerole_check_user(user_info, compile_role_definition(auth_value)): return (1, 'You must be authorized in order to access this document') else: raise ValueError, 'Unexpected authorization type %s for %s' % (repr(auth_type), repr(auth_value)) return (0, CFG_WEBACCESS_WARNING_MSGS[0]) _RE_BAD_MSIE = re.compile("MSIE\s+(\d+\.\d+)") def stream_file(req, fullpath, fullname=None, mime=None, encoding=None, etag=None, md5str=None, location=None, download=False): """This is a generic function to stream a file to the user. If fullname, mime, encoding, and location are not provided they will be guessed based on req and fullpath. md5str should be passed as an hexadecimal string. 
""" def normal_streaming(size): req.set_content_length(size) req.send_http_header() if not req.header_only: req.sendfile(fullpath) return "" def single_range(size, the_range): req.set_content_length(the_range[1]) req.headers_out['Content-Range'] = 'bytes %d-%d/%d' % (the_range[0], the_range[0] + the_range[1] - 1, size) req.status = apache.HTTP_PARTIAL_CONTENT req.send_http_header() if not req.header_only: req.sendfile(fullpath, the_range[0], the_range[1]) return "" def multiple_ranges(size, ranges, mime): req.status = apache.HTTP_PARTIAL_CONTENT boundary = '%s%04d' % (time.strftime('THIS_STRING_SEPARATES_%Y%m%d%H%M%S'), random.randint(0, 9999)) req.content_type = 'multipart/byteranges; boundary=%s' % boundary content_length = 0 for arange in ranges: content_length += len('--%s\r\n' % boundary) content_length += len('Content-Type: %s\r\n' % mime) content_length += len('Content-Range: bytes %d-%d/%d\r\n' % (arange[0], arange[0] + arange[1] - 1, size)) content_length += len('\r\n') content_length += arange[1] content_length += len('\r\n') content_length += len('--%s--\r\n' % boundary) req.set_content_length(content_length) req.send_http_header() if not req.header_only: for arange in ranges: req.write('--%s\r\n' % boundary, 0) req.write('Content-Type: %s\r\n' % mime, 0) req.write('Content-Range: bytes %d-%d/%d\r\n' % (arange[0], arange[0] + arange[1] - 1, size), 0) req.write('\r\n', 0) req.sendfile(fullpath, arange[0], arange[1]) req.write('\r\n', 0) req.write('--%s--\r\n' % boundary) req.flush() return "" def parse_date(date): """According to a date can come in three formats (in order of preference): Sun, 06 Nov 1994 08:49:37 GMT ; RFC 822, updated by RFC 1123 Sunday, 06-Nov-94 08:49:37 GMT ; RFC 850, obsoleted by RFC 1036 Sun Nov 6 08:49:37 1994 ; ANSI C's asctime() format Moreover IE is adding some trailing information after a ';'. Wrong dates should be simpled ignored. This function return the time in seconds since the epoch GMT or None in case of errors.""" if not date: return None try: date = date.split(';')[0].strip() # Because of IE ## Sun, 06 Nov 1994 08:49:37 GMT return time.mktime(time.strptime(date, '%a, %d %b %Y %X %Z')) except: try: ## Sun, 06 Nov 1994 08:49:37 GMT return time.mktime(time.strptime(date, '%A, %d-%b-%y %H:%M:%S %Z')) except: try: ## Sun, 06 Nov 1994 08:49:37 GMT return time.mktime(date) except: return None def parse_ranges(ranges): """According to a (multiple) range request comes in the form: bytes=20-30,40-60,70-,-80 with the meaning: from byte to 20 to 30 inclusive (11 bytes) from byte to 40 to 60 inclusive (21 bytes) from byte 70 to (size - 1) inclusive (size - 70 bytes) from byte size - 80 to (size - 1) inclusive (80 bytes) This function will return the list of ranges in the form: [[first_byte, last_byte], ...] 
If first_byte or last_byte aren't specified they'll be set to None If the list is not well formatted it will return None """ try: if ranges.startswith('bytes') and '=' in ranges: ranges = ranges.split('=')[1].strip() else: return None ret = [] for arange in ranges.split(','): arange = arange.strip() if arange.startswith('-'): ret.append([None, int(arange[1:])]) elif arange.endswith('-'): ret.append([int(arange[:-1]), None]) else: ret.append(map(int, arange.split('-'))) return ret except: return None def parse_tags(tags): """Return a list of tags starting from a comma separated list.""" return [tag.strip() for tag in tags.split(',')] def fix_ranges(ranges, size): """Complementary to parse_ranges it will transform all the ranges into (first_byte, length), adjusting all the value based on the actual size provided. """ ret = [] for arange in ranges: if (arange[0] is None and arange[1] > 0) or arange[0] < size: if arange[0] is None: arange[0] = size - arange[1] elif arange[1] is None: arange[1] = size - arange[0] else: arange[1] = arange[1] - arange[0] + 1 arange[0] = max(0, arange[0]) arange[1] = min(size - arange[0], arange[1]) if arange[1] > 0: ret.append(arange) return ret def get_normalized_headers(): """Strip and lowerize all the keys of the headers dictionary plus strip, lowerize and transform known headers value into their value.""" ret = { 'if-match' : None, 'unless-modified-since' : None, 'if-modified-since' : None, 'range' : None, 'if-range' : None, 'if-none-match' : None, } for key, value in req.headers_in.iteritems(): key = key.strip().lower() value = value.strip() if key in ('unless-modified-since', 'if-modified-since'): value = parse_date(value) elif key == 'range': value = parse_ranges(value) elif key == 'if-range': value = parse_date(value) or parse_tags(value) elif key in ('if-match', 'if-none-match'): value = parse_tags(value) if value: ret[key] = value return ret headers = get_normalized_headers() g = _RE_BAD_MSIE.search(headers.get('user-agent', "MSIE 6.0")) bad_msie = g and float(g.group(1)) < 9.0 if CFG_BIBDOCFILE_USE_XSENDFILE: ## If XSendFile is supported by the server, let's use it. 
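        # Illustrative note (assumption: mod_xsendfile or an equivalent web
        # server module is enabled): in this branch the application never
        # reads the file itself.  It only emits headers along the lines of
        #   Content-Disposition: inline; filename="report.pdf"
        #   X-Sendfile: /opt/invenio/var/data/files/g0/123/report.pdf;1
        # (hypothetical values) and lets the web server perform the actual
        # byte transfer, which is cheaper than streaming through Python.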
if os.path.exists(fullpath): if fullname is None: fullname = os.path.basename(fullpath) if bad_msie: ## IE is confused by quotes req.headers_out["Content-Disposition"] = 'attachment; filename=%s' % fullname.replace('"', '\\"') elif download: req.headers_out["Content-Disposition"] = 'attachment; filename="%s"' % fullname.replace('"', '\\"') else: ## IE is confused by inline req.headers_out["Content-Disposition"] = 'inline; filename="%s"' % fullname.replace('"', '\\"') req.headers_out["X-Sendfile"] = fullpath if mime is None: (mime, encoding) = _mimes.guess_type(fullpath) if mime is None: mime = "application/octet-stream" if not bad_msie: ## IE is confused by not supported mimetypes req.content_type = mime return "" else: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND if headers['if-match']: if etag is not None and etag not in headers['if-match']: raise apache.SERVER_RETURN, apache.HTTP_PRECONDITION_FAILED if os.path.exists(fullpath): mtime = os.path.getmtime(fullpath) if fullname is None: fullname = os.path.basename(fullpath) if mime is None: (mime, encoding) = _mimes.guess_type(fullpath) if mime is None: mime = "application/octet-stream" if location is None: location = req.uri if not bad_msie: ## IE is confused by not supported mimetypes req.content_type = mime req.encoding = encoding req.filename = fullname req.headers_out["Last-Modified"] = time.strftime('%a, %d %b %Y %X GMT', time.gmtime(mtime)) if CFG_ENABLE_HTTP_RANGE_REQUESTS: req.headers_out["Accept-Ranges"] = "bytes" else: req.headers_out["Accept-Ranges"] = "none" req.headers_out["Content-Location"] = location if etag is not None: req.headers_out["ETag"] = etag if md5str is not None: req.headers_out["Content-MD5"] = base64.encodestring(binascii.unhexlify(md5str.upper()))[:-1] if bad_msie: ## IE is confused by quotes req.headers_out["Content-Disposition"] = 'attachment; filename=%s' % fullname.replace('"', '\\"') elif download: req.headers_out["Content-Disposition"] = 'attachment; filename="%s"' % fullname.replace('"', '\\"') else: ## IE is confused by inline req.headers_out["Content-Disposition"] = 'inline; filename="%s"' % fullname.replace('"', '\\"') size = os.path.getsize(fullpath) if not size: try: raise Exception, '%s exists but is empty' % fullpath except Exception: register_exception(req=req, alert_admin=True) raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND if headers['if-modified-since'] and headers['if-modified-since'] >= mtime: raise apache.SERVER_RETURN, apache.HTTP_NOT_MODIFIED if headers['if-none-match']: if etag is not None and etag in headers['if-none-match']: raise apache.SERVER_RETURN, apache.HTTP_NOT_MODIFIED if headers['unless-modified-since'] and headers['unless-modified-since'] < mtime: return normal_streaming(size) if CFG_ENABLE_HTTP_RANGE_REQUESTS and headers['range']: try: if headers['if-range']: if etag is None or etag not in headers['if-range']: return normal_streaming(size) ranges = fix_ranges(headers['range'], size) except: return normal_streaming(size) if len(ranges) > 1: return multiple_ranges(size, ranges, mime) elif ranges: return single_range(size, ranges[0]) else: raise apache.SERVER_RETURN, apache.HTTP_RANGE_NOT_SATISFIABLE else: return normal_streaming(size) else: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND def stream_restricted_icon(req): """Return the content of the "Restricted Icon" file.""" stream_file(req, '%s/img/restricted.gif' % CFG_WEBDIR) raise apache.SERVER_RETURN, apache.DONE #def list_versions_from_array(docfiles): # """Retrieve the list of existing versions from the given 
docfiles list.""" # versions = [] # for docfile in docfiles: # if not docfile.get_version() in versions: # versions.append(docfile.get_version()) # versions.sort() # versions.reverse() # return versions def _make_base_dir(docid): """Given a docid it returns the complete path that should host its files.""" group = "g" + str(int(int(docid) / CFG_BIBDOCFILE_FILESYSTEM_BIBDOC_GROUP_LIMIT)) return os.path.join(CFG_BIBDOCFILE_FILEDIR, group, str(docid)) class Md5Folder(object): """Manage all the Md5 checksum about a folder""" def __init__(self, folder): """Initialize the class from the md5 checksum of a given path""" self.folder = folder self.load() def update(self, only_new=True): """Update the .md5 file with the current files. If only_new is specified then only not already calculated file are calculated.""" if not only_new: self.md5s = {} if os.path.exists(self.folder): for filename in os.listdir(self.folder): if filename not in self.md5s and not filename.startswith('.'): self.md5s[filename] = calculate_md5(os.path.join(self.folder, filename)) self.store() def store(self): """Store the current md5 dictionary into .md5""" try: old_umask = os.umask(022) md5file = open(os.path.join(self.folder, ".md5"), "w") for key, value in self.md5s.items(): md5file.write('%s *%s\n' % (value, key)) md5file.close() os.umask(old_umask) except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError("Encountered an exception while storing .md5 for folder '%s': '%s'" % (self.folder, e)) def load(self): """Load .md5 into the md5 dictionary""" self.md5s = {} md5_path = os.path.join(self.folder, ".md5") if os.path.exists(md5_path): for row in open(md5_path, "r"): md5hash = row[:32] filename = row[34:].strip() self.md5s[filename] = md5hash else: self.update() def check(self, filename=''): """Check the specified file or all the files for which it exists a hash for being coherent with the stored hash.""" if filename and filename in self.md5s.keys(): try: return self.md5s[filename] == calculate_md5(os.path.join(self.folder, filename)) except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError("Encountered an exception while loading '%s': '%s'" % (os.path.join(self.folder, filename), e)) else: for filename, md5hash in self.md5s.items(): try: if calculate_md5(os.path.join(self.folder, filename)) != md5hash: return False except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError("Encountered an exception while loading '%s': '%s'" % (os.path.join(self.folder, filename), e)) return True def get_checksum(self, filename): """Return the checksum of a physical file.""" md5hash = self.md5s.get(filename, None) if md5hash is None: self.update() # Now it should not fail! md5hash = self.md5s[filename] return md5hash def calculate_md5_external(filename): """Calculate the md5 of a physical file through md5sum Command Line Tool. This is suitable for file larger than 256Kb.""" try: md5_result = os.popen(CFG_PATH_MD5SUM + ' -b %s' % escape_shell_arg(filename)) ret = md5_result.read()[:32] md5_result.close() if len(ret) != 32: # Error in running md5sum. Let's fallback to internal # algorithm. return calculate_md5(filename, force_internal=True) else: return ret except Exception, e: raise InvenioBibDocFileError("Encountered an exception while calculating md5 for file '%s': '%s'" % (filename, e)) def calculate_md5(filename, force_internal=False): """Calculate the md5 of a physical file. 
This is suitable for files smaller than 256Kb.""" if not CFG_PATH_MD5SUM or force_internal or os.path.getsize(filename) < CFG_BIBDOCFILE_MD5_THRESHOLD: try: to_be_read = open(filename, "rb") computed_md5 = md5() while True: buf = to_be_read.read(CFG_BIBDOCFILE_MD5_BUFFER) if buf: computed_md5.update(buf) else: break to_be_read.close() return computed_md5.hexdigest() except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError("Encountered an exception while calculating md5 for file '%s': '%s'" % (filename, e)) else: return calculate_md5_external(filename) def bibdocfile_url_to_bibrecdocs(url): """Given an URL in the form CFG_SITE_[SECURE_]URL/CFG_SITE_RECORD/xxx/files/... it returns a BibRecDocs object for the corresponding recid.""" recid = decompose_bibdocfile_url(url)[0] return BibRecDocs(recid) def bibdocfile_url_to_bibdoc(url): """Given an URL in the form CFG_SITE_[SECURE_]URL/CFG_SITE_RECORD/xxx/files/... it returns a BibDoc object for the corresponding recid/docname.""" docname = decompose_bibdocfile_url(url)[1] return bibdocfile_url_to_bibrecdocs(url).get_bibdoc(docname) def bibdocfile_url_to_bibdocfile(url): """Given an URL in the form CFG_SITE_[SECURE_]URL/CFG_SITE_RECORD/xxx/files/... it returns a BibDocFile object for the corresponding recid/docname/format.""" docformat = decompose_bibdocfile_url(url)[2] return bibdocfile_url_to_bibdoc(url).get_file(docformat) def bibdocfile_url_to_fullpath(url): """Given an URL in the form CFG_SITE_[SECURE_]URL/CFG_SITE_RECORD/xxx/files/... it returns the fullpath for the corresponding recid/docname/format.""" return bibdocfile_url_to_bibdocfile(url).get_full_path() def bibdocfile_url_p(url): """Return True when the url is a potential valid url pointing to a fulltext owned by a system.""" if url.startswith('%s/getfile.py' % CFG_SITE_URL) or url.startswith('%s/getfile.py' % CFG_SITE_SECURE_URL): return True if not (url.startswith('%s/%s/' % (CFG_SITE_URL, CFG_SITE_RECORD)) or url.startswith('%s/%s/' % (CFG_SITE_SECURE_URL, CFG_SITE_RECORD))): return False splitted_url = url.split('/files/') return len(splitted_url) == 2 and splitted_url[0] != '' and splitted_url[1] != '' def get_docid_from_bibdocfile_fullpath(fullpath): """Given a bibdocfile fullpath (e.g. "CFG_BIBDOCFILE_FILEDIR/g0/123/bar.pdf;1") returns the docid (e.g. 123).""" if not fullpath.startswith(os.path.join(CFG_BIBDOCFILE_FILEDIR, 'g')): raise InvenioBibDocFileError, "Fullpath %s doesn't correspond to a valid bibdocfile fullpath" % fullpath dirname = decompose_file_with_version(fullpath)[0] try: return int(dirname.split('/')[-1]) except: raise InvenioBibDocFileError, "Fullpath %s doesn't correspond to a valid bibdocfile fullpath" % fullpath def decompose_bibdocfile_fullpath(fullpath): """Given a bibdocfile fullpath (e.g. 
"CFG_BIBDOCFILE_FILEDIR/g0/123/bar.pdf;1") returns a quadruple (recid, docname, format, version).""" if not fullpath.startswith(os.path.join(CFG_BIBDOCFILE_FILEDIR, 'g')): raise InvenioBibDocFileError, "Fullpath %s doesn't correspond to a valid bibdocfile fullpath" % fullpath dirname, dummy, extension, version = decompose_file_with_version(fullpath) try: docid = int(dirname.split('/')[-1]) return {"doc_id" : docid, "extension": extension, "version": version} except: raise InvenioBibDocFileError, "Fullpath %s doesn't correspond to a valid bibdocfile fullpath" % fullpath _RE_BIBDOCFILE_URL = re.compile("/%s/(?P\d+)/files/(?P.*)" % (re.escape(CFG_SITE_RECORD), )) def decompose_bibdocfile_url(url): """Given a bibdocfile_url return a triple (recid, docname, format).""" if url.startswith('%s/getfile.py' % CFG_SITE_URL) or url.startswith('%s/getfile.py' % CFG_SITE_SECURE_URL): return decompose_bibdocfile_very_old_url(url) scheme, netloc, path, query, dummy_fragment = urlsplit(url) if "%s://%s" % (scheme, netloc) not in (CFG_SITE_URL, CFG_SITE_SECURE_URL): raise InvenioBibDocFileError("URL %s doesn't correspond to a valid BibDocFile URL." % url) g = _RE_BIBDOCFILE_URL.match(urllib.unquote(path)) if g: recid = int(g.group('recid')) rest = g.group('rest') dummy, docname, docformat = decompose_file(rest) query = parse_qs(query) if 'subformat' in query: docformat += ";%s" % query['subformat'][0] return recid, docname, docformat else: raise InvenioBibDocFileError, "Url %s doesn't correspond to a valid record inside the system." % url re_bibdocfile_old_url = re.compile(r'/%s/(\d*)/files/' % CFG_SITE_RECORD) def decompose_bibdocfile_old_url(url): """Given a bibdocfile old url (e.g. CFG_SITE_URL/CFG_SITE_RECORD/123/files) it returns the recid.""" g = re_bibdocfile_old_url.search(url) if g: return int(g.group(1)) raise InvenioBibDocFileError('%s is not a valid old bibdocfile url' % url) def decompose_bibdocfile_very_old_url(url): """Decompose an old /getfile.py? URL""" if url.startswith('%s/getfile.py' % CFG_SITE_URL) or url.startswith('%s/getfile.py' % CFG_SITE_SECURE_URL): params = urllib.splitquery(url)[1] if params: try: params = parse_qs(params) if 'docid' in params: docid = int(params['docid'][0]) bibdoc = BibDoc.create_instance(docid) if bibdoc.bibrec_links: recid = bibdoc.bibrec_links[0]["rec_id"] docname = bibdoc.bibrec_links[0]["doc_name"] else: raise InvenioBibDocFileError("Old style URL pointing to an unattached document") elif 'recid' in params: recid = int(params['recid'][0]) if 'name' in params: docname = params['name'][0] else: docname = '' else: raise InvenioBibDocFileError('%s has not enough params to correspond to a bibdocfile.' % url) docformat = normalize_format(params.get('format', [''])[0]) return (recid, docname, docformat) except Exception, e: raise InvenioBibDocFileError('Problem with %s: %s' % (url, e)) else: raise InvenioBibDocFileError('%s has no params to correspond to a bibdocfile.' % url) else: raise InvenioBibDocFileError('%s is not a valid very old bibdocfile url' % url) def get_docname_from_url(url): """Return a potential docname given a url""" path = urlsplit(urllib.unquote(url))[2] filename = os.path.split(path)[-1] return file_strip_ext(filename) def get_format_from_url(url): """Return a potential format given a url""" path = urlsplit(urllib.unquote(url))[2] filename = os.path.split(path)[-1] return filename[len(file_strip_ext(filename)):] def clean_url(url): """Given a local url e.g. 
a local path it render it a realpath.""" if is_url_a_local_file(url): path = urlsplit(urllib.unquote(url))[2] return os.path.abspath(path) else: return url def is_url_a_local_file(url): """Return True if the given URL is pointing to a local file.""" protocol = urlsplit(url)[0] return protocol in ('', 'file') def check_valid_url(url): """ Check for validity of a url or a file. @param url: the URL to check @type url: string @raise StandardError: if the URL is not a valid URL. """ try: if is_url_a_local_file(url): path = urlsplit(urllib.unquote(url))[2] if os.path.abspath(path) != path: raise StandardError, "%s is not a normalized path (would be %s)." % (path, os.path.normpath(path)) for allowed_path in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS + [CFG_TMPDIR, CFG_TMPSHAREDDIR, CFG_WEBSUBMIT_STORAGEDIR]: if path.startswith(allowed_path): dummy_fd = open(path) dummy_fd.close() return raise StandardError, "%s is not in one of the allowed paths." % path else: try: open_url(url) except InvenioBibdocfileUnauthorizedURL, e: raise StandardError, str(e) except Exception, e: raise StandardError, "%s is not a correct url: %s" % (url, e) def safe_mkstemp(suffix, prefix='bibdocfile_'): """Create a temporary filename that don't have any '.' inside a part from the suffix.""" tmpfd, tmppath = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=CFG_TMPDIR) # Close the file and leave the responsability to the client code to # correctly open/close it. os.close(tmpfd) if '.' not in suffix: # Just in case format is empty return tmppath while '.' in os.path.basename(tmppath)[:-len(suffix)]: os.remove(tmppath) tmpfd, tmppath = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=CFG_TMPDIR) os.close(tmpfd) return tmppath def download_local_file(filename, docformat=None): """ Copies a local file to Invenio's temporary directory. @param filename: the name of the file to copy @type filename: string @param format: the format of the file to copy (will be found if not specified) @type format: string @return: the path of the temporary file created @rtype: string @raise StandardError: if something went wrong """ # Make sure the format is OK. if docformat is None: docformat = guess_format_from_url(filename) else: docformat = normalize_format(docformat) tmppath = '' # Now try to copy. try: path = urlsplit(urllib.unquote(filename))[2] if os.path.abspath(path) != path: raise StandardError, "%s is not a normalized path (would be %s)." \ % (path, os.path.normpath(path)) for allowed_path in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS + [CFG_TMPDIR, CFG_WEBSUBMIT_STORAGEDIR]: if path.startswith(allowed_path): tmppath = safe_mkstemp(docformat) shutil.copy(path, tmppath) if os.path.getsize(tmppath) == 0: os.remove(tmppath) raise StandardError, "%s seems to be empty" % filename break else: raise StandardError, "%s is not in one of the allowed paths." % path except Exception, e: raise StandardError, "Impossible to copy the local file '%s': %s" % \ (filename, str(e)) return tmppath def download_external_url(url, docformat=None): """ Download a url (if it corresponds to a remote file) and return a local url to it. @param url: the URL to download @type url: string @param format: the format of the file (will be found if not specified) @type format: string @return: the path to the download local file @rtype: string @raise StandardError: if the download failed """ tmppath = None # Make sure the format is OK. 
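    # Illustrative note with hypothetical URLs: for a link such as
    #   http://example.org/preprint.pdf
    # the decompose_file() call below finds the '.pdf' extension directly,
    # whereas for an extension-less link such as
    #   http://example.org/download?id=42
    # no known format is detected here and it is resolved later from the
    # Content-Type / Content-Disposition headers of the HTTP response.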
if docformat is None: # First try to find a known extension to the URL docformat = decompose_file(url, skip_version=True, only_known_extensions=True)[2] if not docformat: # No correct format could be found. Will try to get it from the # HTTP message headers. docformat = '' else: docformat = normalize_format(docformat) from_file, to_file, tmppath = None, None, '' try: from_file = open_url(url) except InvenioBibdocfileUnauthorizedURL, e: raise StandardError, str(e) except urllib2.URLError, e: raise StandardError, 'URL could not be opened: %s' % str(e) if not docformat: # We could not determine the format from the URL, so let's try # to read it from the HTTP headers. docformat = get_format_from_http_response(from_file) try: tmppath = safe_mkstemp(docformat) to_file = open(tmppath, 'w') while True: block = from_file.read(CFG_BIBDOCFILE_BLOCK_SIZE) if not block: break to_file.write(block) to_file.close() from_file.close() if os.path.getsize(tmppath) == 0: raise StandardError, "%s seems to be empty" % url except Exception, e: # Try to close and remove the temporary file. try: to_file.close() except Exception: pass try: os.remove(tmppath) except Exception: pass raise StandardError, "Error when downloading %s into %s: %s" % \ (url, tmppath, e) return tmppath def get_format_from_http_response(response): """ Tries to retrieve the format of the file from the message headers of the HTTP response. @param response: the HTTP response @type response: file-like object (as returned by urllib.urlopen) @return: the format of the remote resource @rtype: string """ def parse_content_type(text): return text.split(';')[0].strip() def parse_content_disposition(text): for item in text.split(';'): item = item.strip() if item.strip().startswith('filename='): return item[len('filename="'):-len('"')] info = response.info() docformat = '' content_disposition = info.getheader('Content-Disposition') if content_disposition: filename = parse_content_disposition(content_disposition) if filename: docformat = decompose_file(filename, only_known_extensions=False)[2] if docformat: return docformat content_type = info.getheader('Content-Type') if content_type: content_type = parse_content_type(content_type) if content_type not in ('text/plain', 'application/octet-stream'): ## We actually ignore these mimetypes since they are the ## defaults often returned by Apache in case the mimetype ## was not known if content_type in CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING: docformat = normalize_format(CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING[content_type]) else: ext = _mimes.guess_extension(content_type) if ext: docformat = normalize_format(ext) return docformat def download_url(url, docformat=None): """ Download a url (if it corresponds to a remote file) and return a local url to it. """ tmppath = None try: if is_url_a_local_file(url): tmppath = download_local_file(url, docformat = docformat) else: tmppath = download_external_url(url, docformat = docformat) except StandardError: raise return tmppath class MoreInfo(object): """This class represents a genering MoreInfo dictionary. MoreInfo object can be attached to bibdoc, bibversion, format or BibRelation. The entity where a particular MoreInfo object is attached has to be specified using the constructor parametes. This class is a thin wrapper around the database table. 
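# Illustrative sketch, not part of the patch: basic use of the MoreInfo class defined
# below.  With cache_only=True nothing is written to the database, so the snippet runs
# without a configured Invenio instance; docid 123 is made up.
more_info = MoreInfo(docid=123, cache_only=True)
more_info.set_data("ns", "language", "en")       # namespaced write
more_info["checksum"] = "abc123"                 # default namespace, dict interface
assert more_info.get_data("ns", "language") == "en"
assert "checksum" in more_info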
""" def __init__(self, docid = None, version = None, docformat = None, relation = None, cache_only = False, cache_reads = True, initial_data = None): """ @param cache_only Determines if MoreInfo object should be created in memory only or reflected in the database @type cache_only boolean @param cache_reads Determines if reads should be executed on the in-memory cache or should be redirected to the database. If this is true, cache can be entirely regenerated from the database only upon an explicit request. If the value is not present in the cache, the database is queried @type cache_reads boolean @param initial_data Allows to specify initial content of the cache. This parameter is useful when we create an in-memory instance from serialised value @type initial_data string """ self.docid = docid self.version = version self.format = docformat self.relation = relation self.cache_only = cache_only if initial_data != None: self.cache = initial_data self.dirty = initial_data if not self.cache_only: self._flush_cache() #inserts new entries else: self.cache = {} self.dirty = {} self.cache_reads = cache_reads if not self.cache_only: self.populate_from_database() @staticmethod def create_from_serialised(ser_str, docid = None, version = None, docformat = None, relation = None, cache_only = False, cache_reads = True): """Creates an instance of MoreInfo using serialised data as the cache content""" data = cPickle.loads(base64.b64decode(ser_str)) return MoreInfo(docid = docid, version = version, docformat = docformat, relation = relation, cache_only = cache_only, cache_reads = cache_reads, initial_data = data); def serialise_cache(self): """Returns a serialised representation of the cache""" return base64.b64encode(cPickle.dumps(self.get_cache())) def populate_from_database(self): """Retrieves all values of MoreInfo and places them in the cache""" where_str, where_args = self._generate_where_query_args() query_str = "SELECT namespace, data_key, data_value FROM bibdocmoreinfo WHERE %s" % (where_str, ) res = run_sql(query_str, where_args) if res: for row in res: namespace, data_key, data_value_ser = row data_value = cPickle.loads(data_value_ser) if not namespace in self.cache: self.cache[namespace] = {} self.cache[namespace][data_key] = data_value def _mark_dirty(self, namespace, data_key): """Marks a data key dirty - that should be saved into the database""" if not namespace in self.dirty: self.dirty[namespace] = {} self.dirty[namespace][data_key] = True def _database_get_distinct_string_list(self, column, namespace = None): """A private method reading an unique list of strings from the moreinfo database table""" where_str, where_args = self._generate_where_query_args( namespace = namespace) query_str = "SELECT DISTINCT %s FROM bibdocmoreinfo WHERE %s" % \ ( column, where_str, ) if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(where_args)) print "Executing query: " + query_str + " ARGS: " + repr(where_args) res = run_sql(query_str, where_args) return (res and [x[0] for x in res]) or [] # after migrating to python 2.6, can be rewritten using x if y else z syntax: return [x[0] for x in res] if res else [] def _database_get_namespaces(self): """Read the database to discover namespaces declared in a given MoreInfo""" return self._database_get_distinct_string_list("namespace") def _database_get_keys(self, namespace): """Returns all keys assigned in a given namespace of a MoreInfo instance""" return 
self._database_get_distinct_string_list("data_key", namespace=namespace) def _database_contains_key(self, namespace, key): return self._database_read_value(namespace, key) != None def _database_save_value(self, namespace, key, value): """Write changes into the database""" #TODO: this should happen within one transaction serialised_val = cPickle.dumps(value) # on duplicate key will not work here as miltiple null values are permitted by the index if not self._database_contains_key(namespace, key): #insert new value query_parts = [] query_args = [] to_process = [(self.docid, "id_bibdoc"), (self.version, "version"), (self.format, "format"), (self.relation, "id_rel"), (str(namespace), "namespace"), (str(key), "data_key"), (str(serialised_val), "data_value")] for entry in to_process: _val_or_null(entry[0], q_str = query_parts, q_args = query_args) columns_str = ", ".join(map(lambda x: x[1], to_process)) values_str = ", ".join(query_parts) query_str = "INSERT INTO bibdocmoreinfo (%s) VALUES(%s)" % \ (columns_str, values_str) if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(query_args)) print "Executing query: " + query_str + " ARGS: " + repr(query_args) run_sql(query_str, query_args) else: #Update existing value where_str, where_args = self._generate_where_query_args(namespace, key) query_str = "UPDATE bibdocmoreinfo SET data_value=%s WHERE " + where_str query_args = [str(serialised_val)] + where_args if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(query_args)) print "Executing query: " + query_str + " ARGS: " + repr(query_args) run_sql(query_str, query_args ) def _database_read_value(self, namespace, key): """Reads a value directly from the database @param namespace - namespace of the data to be read @param key - key of the data to be read """ where_str, where_args = self._generate_where_query_args(namespace = namespace, data_key = key) query_str = "SELECT data_value FROM bibdocmoreinfo WHERE " + where_str res = run_sql(query_str, where_args) if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(where_args) + "WITH THE RESULT: " + str(res)) s_ = "" if res: s_ = cPickle.loads(res[0][0]) print "Executing query: " + query_str + " ARGS: " + repr(where_args) + " WITH THE RESULT: " + str(s_) if res and res[0][0]: try: return cPickle.loads(res[0][0]) except: raise Exception("Error when deserialising value for %s key=%s retrieved value=%s" % (repr(self), str(key), str(res[0][0]))) return None def _database_remove_value(self, namespace, key): """Removes an entry directly in the database""" where_str, where_args = self._generate_where_query_args(namespace = namespace, data_key = key) query_str = "DELETE FROM bibdocmoreinfo WHERE " + where_str if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(where_args)) print "Executing query: " + query_str + " ARGS: " + repr(where_args) run_sql(query_str, where_args) return None def _flush_cache(self): """Writes all the dirty cache entries into the database""" for namespace in self.dirty: for data_key in self.dirty[namespace]: if namespace in self.cache and data_key in self.cache[namespace]\ and not self.cache[namespace][data_key] is None: self._database_save_value(namespace, data_key, self.cache[namespace][data_key]) else: # This might happen if a value has been 
removed from the cache self._database_remove_value(namespace, data_key) self.dirty = {} def _generate_where_query_args(self, namespace = None, data_key = None): """Private method generating WHERE clause of SQL statements""" ns = [] if namespace != None: ns = [(namespace, "namespace")] dk = [] if data_key != None: dk = [(data_key, "data_key")] to_process = [(self.docid, "id_bibdoc"), (self.version, "version"), (self.format, "format"), (self.relation, "id_rel")] + \ ns + dk return _sql_generate_conjunctive_where(to_process) def set_data(self, namespace, key, value): """setting data directly in the database dictionary""" if not namespace in self.cache: self.cache[namespace] = {} self.cache[namespace][key] = value self._mark_dirty(namespace, key) if not self.cache_only: self._flush_cache() def get_data(self, namespace, key): """retrieving data from the database""" if self.cache_reads or self.cache_only: if namespace in self.cache and key in self.cache[namespace]: return self.cache[namespace][key] if not self.cache_only: # we have a permission to read from the database value = self._database_read_value(namespace, key) if value: if not namespace in self.cache: self.cache[namespace] = {} self.cache[namespace][key] = value return value return None def del_key(self, namespace, key): """retrieving data from the database""" if not namespace in self.cache: return None del self.cache[namespace][key] self._mark_dirty(namespace, key) if not self.cache_only: self._flush_cache() def contains_key(self, namespace, key): return self.get_data(namespace, key) != None # the dictionary interface -> updating the default namespace def __setitem__(self, key, value): self.set_data("", key, value) #the default value def __getitem__(self, key): return self.get_data("", key) def __delitem__(self, key): self.del_key("", key) def __contains__(self, key): return self.contains_key("", key) def __repr__(self): return "MoreInfo(docid=%s, version=%s, docformat=%s, relation=%s)" % \ (self.docid, self.version, self.format, self.relation) def delete(self): """Remove all entries associated with this MoreInfo""" self.cache = {} if not self.cache_only: where_str, query_args = self._generate_where_query_args() query_str = "DELETE FROM bibdocmoreinfo WHERE %s" % (where_str, ) if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(query_args)) print "Executing query: " + query_str + " ARGS: " + repr(query_args) run_sql(query_str, query_args) def get_cache(self): """Returns the content of the cache @return The content of the MoreInfo cache @rtype dictionary {namespace: {key1: value1, ... }, namespace2: {}} """ return self.cache def get_namespaces(self): """Returns a list of namespaces present in the MoreInfo structure. If the object is permitted access to the database, the data should be always read from there. 
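# Illustrative sketch, not part of the patch: the MoreInfo cache can be serialised
# (base64-encoded pickle) and later restored without touching the database when
# cache_only=True; docid 5 is made up.
source = MoreInfo(docid=5, cache_only=True)
source["status"] = "draft"
blob = source.serialise_cache()
restored = MoreInfo.create_from_serialised(blob, docid=5, cache_only=True)
assert restored["status"] == "draft"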
Unlike when reading a particular value, we can not check if value is missing in the cache """ if self.cache_only and self.cache_reads: return self.cache.keys() return self._database_get_namespaces() def get_keys(self, namespace): """Returns a list of keys present in a given namespace""" if self.cache_only and self.cache_reads: res = [] if namespace in self.cache: res = self.cache[namespace].keys() return res else: return self._database_get_keys(namespace) def flush(self): """Flush the content into the database""" self._flush_cache() class BibDocMoreInfo(MoreInfo): """ This class wraps contextual information of the documents, such as the - comments - descriptions - flags. Such information is kept separately per every format/version instance of the corresponding document and is searialized in the database, ready to be retrieved (but not searched). @param docid: the document identifier. @type docid: integer @param more_info: a serialized version of an already existing more_info object. If not specified this information will be readed from the database, and othewise an empty dictionary will be allocated. @raise ValueError: if docid is not a positive integer. @ivar docid: the document identifier as passed to the constructor. @type docid: integer @ivar more_info: the more_info dictionary that will hold all the additional document information. @type more_info: dict of dict of dict @note: in general this class is never instanciated in client code and never used outside bibdocfile module. @note: this class will be extended in the future to hold all the new auxiliary information about a document. """ def __init__(self, docid, cache_only = False, initial_data = None): if not (type(docid) in (long, int) and docid > 0): raise ValueError("docid is not a positive integer, but %s." % docid) MoreInfo.__init__(self, docid, cache_only = cache_only, initial_data = initial_data) if 'descriptions' not in self: self['descriptions'] = {} if 'comments' not in self: self['comments'] = {} if 'flags' not in self: self['flags'] = {} if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Creating BibDocMoreInfo :" + repr(self["comments"])) print "Creating BibdocMoreInfo :" + repr(self["comments"]) def __repr__(self): """ @return: the canonical string representation of the C{BibDocMoreInfo}. @rtype: string """ return 'BibDocMoreInfo(%i, %s)' % (self.docid, repr(cPickle.dumps(self))) def set_flag(self, flagname, docformat, version): """ Sets a flag. @param flagname: the flag to set (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}). @type flagname: string @param format: the format for which the flag should set. @type format: string @param version: the version for which the flag should set: @type version: integer @raise ValueError: if the flag is not in L{CFG_BIBDOCFILE_AVAILABLE_FLAGS} """ if flagname in CFG_BIBDOCFILE_AVAILABLE_FLAGS: flags = self['flags'] if not flagname in flags: flags[flagname] = {} if not version in flags[flagname]: flags[flagname][version] = {} if not docformat in flags[flagname][version]: flags[flagname][version][docformat] = {} flags[flagname][version][docformat] = True self['flags'] = flags else: raise ValueError, "%s is not in %s" % \ (flagname, CFG_BIBDOCFILE_AVAILABLE_FLAGS) def get_comment(self, docformat, version): """ Returns the specified comment. @param format: the format for which the comment should be retrieved. @type format: string @param version: the version for which the comment should be retrieved. @type version: integer @return: the specified comment. 
@rtype: string """ try: assert(type(version) is int) docformat = normalize_format(docformat) return self['comments'].get(version, {}).get(docformat) except: register_exception() raise def get_description(self, docformat, version): """ Returns the specified description. @param format: the format for which the description should be retrieved. @type format: string @param version: the version for which the description should be retrieved. @type version: integer @return: the specified description. @rtype: string """ try: assert(type(version) is int) docformat = normalize_format(docformat) return self['descriptions'].get(version, {}).get(docformat) except: register_exception() raise def has_flag(self, flagname, docformat, version): """ Return True if the corresponding has been set. @param flagname: the name of the flag (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}). @type flagname: string @param format: the format for which the flag should be checked. @type format: string @param version: the version for which the flag should be checked. @type version: integer @return: True if the flag is set for the given format/version. @rtype: bool @raise ValueError: if the flagname is not in L{CFG_BIBDOCFILE_AVAILABLE_FLAGS} """ if flagname in CFG_BIBDOCFILE_AVAILABLE_FLAGS: return self['flags'].get(flagname, {}).get(version, {}).get(docformat, False) else: raise ValueError, "%s is not in %s" % (flagname, CFG_BIBDOCFILE_AVAILABLE_FLAGS) def get_flags(self, docformat, version): """ Return the list of all the enabled flags. @param format: the format for which the list should be returned. @type format: string @param version: the version for which the list should be returned. @type version: integer @return: the list of enabled flags (from L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}). @rtype: list of string """ return [flag for flag in self['flags'] if docformat in self['flags'][flag].get(version, {})] def set_comment(self, comment, docformat, version): """ Set a comment. @param comment: the comment to be set. @type comment: string @param format: the format for which the comment should be set. @type format: string @param version: the version for which the comment should be set: @type version: integer """ try: assert(type(version) is int and version > 0) docformat = normalize_format(docformat) if comment == KEEP_OLD_VALUE: comment = self.get_comment(docformat, version) or self.get_comment(docformat, version - 1) if not comment: self.unset_comment(docformat, version) return if not version in self['comments']: comments = self['comments'] comments[version] = {} self['comments'] = comments comments = self['comments'] comments[version][docformat] = comment self['comments'] = comments except: register_exception() raise def set_description(self, description, docformat, version): """ Set a description. @param description: the description to be set. @type description: string @param format: the format for which the description should be set. 
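# Illustrative sketch, not part of the patch: per (format, version) comments and flags
# handled by BibDocMoreInfo.  cache_only=True keeps the object in memory; 'STAMPED' is
# assumed here to be one of CFG_BIBDOCFILE_AVAILABLE_FLAGS.
info = BibDocMoreInfo(42, cache_only=True)
info.set_comment("Scanned copy", ".pdf", 1)
info.set_flag("STAMPED", ".pdf", 1)
print info.get_comment(".pdf", 1)            # -> 'Scanned copy'
print info.has_flag("STAMPED", ".pdf", 1)    # -> True
print info.get_flags(".pdf", 1)              # -> ['STAMPED']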
@type format: string @param version: the version for which the description should be set: @type version: integer """ try: assert(type(version) is int and version > 0) docformat = normalize_format(docformat) if description == KEEP_OLD_VALUE: description = self.get_description(docformat, version) or self.get_description(docformat, version - 1) if not description: self.unset_description(docformat, version) return descriptions = self['descriptions'] if not version in descriptions: descriptions[version] = {} descriptions[version][docformat] = description self.set_data("", 'descriptions', descriptions) except: register_exception() raise def unset_comment(self, docformat, version): """ Unset a comment. @param format: the format for which the comment should be unset. @type format: string @param version: the version for which the comment should be unset: @type version: integer """ try: assert(type(version) is int and version > 0) comments = self['comments'] del comments[version][docformat] self['comments'] = comments except KeyError: pass except: register_exception() raise def unset_description(self, docformat, version): """ Unset a description. @param format: the format for which the description should be unset. @type format: string @param version: the version for which the description should be unset: @type version: integer """ try: assert(type(version) is int and version > 0) descriptions = self['descriptions'] del descriptions[version][docformat] self['descriptions'] = descriptions except KeyError: pass except: register_exception() raise def unset_flag(self, flagname, docformat, version): """ Unset a flag. @param flagname: the flag to be unset (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}). @type flagname: string @param format: the format for which the flag should be unset. @type format: string @param version: the version for which the flag should be unset: @type version: integer @raise ValueError: if the flag is not in L{CFG_BIBDOCFILE_AVAILABLE_FLAGS} """ if flagname in CFG_BIBDOCFILE_AVAILABLE_FLAGS: try: flags = self['flags'] del flags[flagname][version][docformat] self['flags'] = flags except KeyError: pass else: raise ValueError, "%s is not in %s" % (flagname, CFG_BIBDOCFILE_AVAILABLE_FLAGS) _bib_relation__any_value = -1 class BibRelation(object): """ A representation of a relation between documents or their particular versions """ def __init__(self, rel_type = None, bibdoc1_id = None, bibdoc2_id = None, bibdoc1_ver = None, bibdoc2_ver = None, bibdoc1_fmt = None, bibdoc2_fmt = None, rel_id = None): """ The constructor of the class representing a relation between two documents. If the more_info parameter is specified, no data is retrieved from the database and the internal dictionary is initialised with the passed value. If the more_info is not provided, the value is read from the database. In the case of non-existing record, an empty dictionary is assigned. If a version of whichever record is not specified, the resulting object desctibes a relation of all version of a given BibDoc. 
@param bibdoc1 @type bibdoc1 BibDoc @param bibdoc1_ver @type version1_ver int @param bibdoc2 @type bibdoc2 BibDco @param bibdoc2_ver @type bibdoc2_ver int @param bibdoc1_fmt format of the first document @type bibdoc1_fmt string @param bibdoc2_fmt format of the second document @type bibdoc2_fmt string @param rel_type @type rel_type string @param more_info The serialised representation of the more_info @type more_info string @param rel_id allows to specify the identifier of the newly created relation @type rel_ide unsigned int """ self.id = rel_id self.bibdoc1_id = bibdoc1_id self.bibdoc2_id = bibdoc2_id self.bibdoc1_ver = bibdoc1_ver self.bibdoc2_ver = bibdoc2_ver self.bibdoc1_fmt = bibdoc1_fmt self.bibdoc2_fmt = bibdoc2_fmt self.rel_type = rel_type if rel_id == None: self._fill_id_from_data() else: self._fill_data_from_id() self.more_info = MoreInfo(relation = self.id) def _fill_data_from_id(self): """Fill all the relation data from the relation identifier """ query = "SELECT id_bibdoc1, version1, format1, id_bibdoc2, version2, format2, rel_type FROM bibdoc_bibdoc WHERE id=%s" res = run_sql(query, (str(self.id), )) if res != None and res[0] != None: self.bibdoc1_id = res[0][0] self.bibdoc1_ver = res[0][1] self.bibdoc1_fmt = res[0][2] self.bibdoc2_id = res[0][3] self.bibdoc2_ver = res[0][4] self.bibdoc2_fmt = res[0][5] self.rel_type = res[0][6] def _fill_id_from_data(self): """Fill the relation identifier based on the data provided""" where_str, where_args = self._get_where_clauses() query = "SELECT id FROM bibdoc_bibdoc WHERE %s" % (where_str, ) res = run_sql(query, where_args) if res and res[0][0]: self.id = int(res[0][0]) def _get_value_column_mapping(self): """ Returns a list of tuples each tuple consists of a value and a name of a database column where this value should fit """ return [(self.rel_type, "rel_type"), (self.bibdoc1_id, "id_bibdoc1"), (self.bibdoc1_ver, "version1"), (self.bibdoc1_fmt, "format1"), (self.bibdoc2_id, "id_bibdoc2"), (self.bibdoc2_ver, "version2"), (self.bibdoc2_fmt, "format2")] def _get_where_clauses(self): """Private function returning part of the SQL statement identifying current relation @return @rtype tuple """ return _sql_generate_conjunctive_where(self._get_value_column_mapping()) @staticmethod def create(bibdoc1_id = None, bibdoc1_ver = None, bibdoc1_fmt = None, bibdoc2_id = None, bibdoc2_ver = None, bibdoc2_fmt = None, rel_type = ""): """ Create a relation and return instance. 
Ommiting an argument means that a particular relation concerns any value of the parameter """ # check if there is already entry corresponding to parameters existing = BibRelation.get_relations(rel_type = rel_type, bibdoc1_id = bibdoc1_id, bibdoc2_id = bibdoc2_id, bibdoc1_ver = bibdoc1_ver, bibdoc2_ver = bibdoc2_ver, bibdoc1_fmt = bibdoc1_fmt, bibdoc2_fmt = bibdoc2_fmt) if len(existing) > 0: return existing[0] # build the insert query and execute it to_process = [(rel_type, "rel_type"), (bibdoc1_id, "id_bibdoc1"), (bibdoc1_ver, "version1"), (bibdoc1_fmt, "format1"), (bibdoc2_id, "id_bibdoc2"), (bibdoc2_ver, "version2"), (bibdoc2_fmt, "format2")] values_list = [] args_list = [] columns_list = [] for entry in to_process: columns_list.append(entry[1]) if entry[0] == None: values_list.append("NULL") else: values_list.append("%s") args_list.append(entry[0]) query = "INSERT INTO bibdoc_bibdoc (%s) VALUES (%s)" % (", ".join(columns_list), ", ".join(values_list)) # print "Query: %s Args: %s" % (query, str(args_list)) rel_id = run_sql(query, args_list) return BibRelation(rel_id = rel_id) def delete(self): """ Removes a relation between objects from the database. executing the flush function on the same object will restore the relation """ where_str, where_args = self._get_where_clauses() run_sql("DELETE FROM bibdoc_bibdoc WHERE %s" % (where_str,), where_args) # kwalitee: disable=sql # removing associated MoreInfo self.more_info.delete() def get_more_info(self): return self.more_info @staticmethod def get_relations(rel_type = _bib_relation__any_value, bibdoc1_id = _bib_relation__any_value, bibdoc2_id = _bib_relation__any_value, bibdoc1_ver = _bib_relation__any_value, bibdoc2_ver = _bib_relation__any_value, bibdoc1_fmt = _bib_relation__any_value, bibdoc2_fmt = _bib_relation__any_value): """Retrieves list of relations satisfying condtions. If a parameter is specified, its value has to match exactly. 
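# Illustrative sketch, not part of the patch: linking two documents with BibRelation
# and querying the link back.  This needs a configured Invenio database; the ids and
# the 'is_extracted_from' relation type are made up for the example.
rel = BibRelation.create(bibdoc1_id=10, bibdoc2_id=11, rel_type="is_extracted_from")
rel["note"] = "figure extracted from the main PDF"   # stored in the relation's MoreInfo
found = BibRelation.get_relations(rel_type="is_extracted_from", bibdoc1_id=10)
print found[0].bibdoc2_id   # -> 11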
If a parameter is ommited, any of its values will be accepted""" to_process = [(rel_type, "rel_type"), (bibdoc1_id, "id_bibdoc1"), (bibdoc1_ver, "version1"), (bibdoc1_fmt, "format1"), (bibdoc2_id, "id_bibdoc2"), (bibdoc2_ver, "version2"), (bibdoc2_fmt, "format2")] where_str, where_args = _sql_generate_conjunctive_where( filter(lambda x: x[0] != _bib_relation__any_value, to_process)) if where_str: where_str = "WHERE " + where_str # in case of nonempty where, we need a where clause query_str = "SELECT id FROM bibdoc_bibdoc %s" % (where_str, ) # print "running query : %s with arguments %s on the object %s" % (query_str, str(where_args), repr(self)) try: res = run_sql(query_str, where_args) except: raise Exception(query_str + " " + str(where_args)) results = [] if res != None: for res_row in res: results.append(BibRelation(rel_id=res_row[0])) return results # Access to MoreInfo def set_data(self, category, key, value): """assign additional information to this relation""" self.more_info.set_data(category, key, value) def get_data(self, category, key): """read additional information assigned to this relation""" return self.more_info.get_data(category, key) #the dictionary interface allowing to set data bypassing the namespaces def __setitem__(self, key, value): self.more_info[key] = value def __getitem__(self, key): return self.more_info[key] def __contains__(self, key): return self.more_info.__contains__(key) def __repr__(self): return "BibRelation(id_bibdoc1 = %s, version1 = %s, format1 = %s, id_bibdoc2 = %s, version2 = %s, format2 = %s, rel_type = %s)" % \ (self.bibdoc1_id, self.bibdoc1_ver, self.bibdoc1_fmt, self.bibdoc2_id, self.bibdoc2_ver, self.bibdoc2_fmt, self.rel_type) def readfile(filename): """ Read a file. @param filename: the name of the file to be read. @type filename: string @return: the text contained in the file. @rtype: string @note: Returns empty string in case of any error. @note: this function is useful for quick implementation of websubmit functions. """ try: return open(filename).read() except Exception: return '' class HeadRequest(urllib2.Request): """ A request object to perform a HEAD request. """ def get_method(self): return 'HEAD' def read_cookie(cookiefile): """ Parses a cookie file and returns a string as needed for the urllib2 headers The file should respect the Netscape cookie specifications """ cookie_data = '' cfile = open(cookiefile, 'r') for line in cfile.readlines(): tokens = line.split('\t') if len(tokens) == 7: # we are on a cookie line cookie_data += '%s=%s; ' % (tokens[5], tokens[6].replace('\n', '')) cfile.close() return cookie_data def open_url(url, headers=None, head_request=False): """ Opens a URL. If headers are passed as argument, no check is performed and the URL will be opened. Otherwise checks if the URL is present in CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS and uses the headers specified in the config variable. @param url: the URL to open @type url: string @param headers: the headers to use @type headers: dictionary @param head_request: if True, perform a HEAD request, otherwise a POST request @type head_request: boolean @return: a file-like object as returned by urllib2.urlopen. """ headers_to_use = None if headers is None: for regex, headers in _CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS: if regex.match(url) is not None: headers_to_use = headers break if headers_to_use is None: # URL is not allowed. raise InvenioBibdocfileUnauthorizedURL, "%s is not an authorized " \ "external URL." 
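# Illustrative sketch, not part of the patch: open_url() above only fetches URLs that
# match an entry of _CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS, a list of
# (compiled regexp, headers dict) pairs, and sends the headers configured for the
# matching entry.  The entry and URL below are hypothetical.
import re
_allowed_example = [(re.compile(r'https?://export\.example\.org/.*'),
                     {'Accept': 'application/pdf'})]

def _headers_for_example(url):
    for regexp, headers in _allowed_example:
        if regexp.match(url) is not None:
            return headers
    raise ValueError('%s is not an authorized external URL.' % url)

print _headers_for_example('https://export.example.org/record/1.pdf')
# -> {'Accept': 'application/pdf'}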
% url else: headers_to_use = headers request_obj = head_request and HeadRequest or urllib2.Request request = request_obj(url) request.add_header('User-Agent', make_user_agent_string('bibdocfile')) for key, value in headers_to_use.items(): try: value = globals()[value['fnc']](**value['args']) except (KeyError, TypeError): pass request.add_header(key, value) return urllib2.urlopen(request) def update_modification_date_of_file(filepath, modification_date): """Update the modification time and date of the file with the modification_date @param filepath: the full path of the file that needs to be updated @type filepath: string @param modification_date: the new modification date and time @type modification_date: datetime.datetime object """ try: modif_date_in_seconds = time.mktime(modification_date.timetuple()) # try to get the time in seconds except (AttributeError, TypeError): modif_date_in_seconds = 0 if modif_date_in_seconds: statinfo = os.stat(filepath) # we need to keep the same access time os.utime(filepath, (statinfo.st_atime, modif_date_in_seconds)) #update the modification time diff --git a/modules/bibformat/lib/bibreformat.py b/modules/bibformat/lib/bibreformat.py index 9c712dcbd..9c51e122f 100644 --- a/modules/bibformat/lib/bibreformat.py +++ b/modules/bibformat/lib/bibreformat.py @@ -1,509 +1,509 @@ # -*- mode: python; coding: utf-8; -*- # # This file is part of Invenio. -# Copyright (C) 2007, 2008, 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2007, 2008, 2010, 2011, 2012, 2014, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
"""Call BibFormat engine and create HTML brief (and other) formats cache for bibliographic records.""" __revision__ = "$Id$" import os from datetime import datetime from invenio.dbquery import run_sql from invenio.intbitset import intbitset from invenio.search_engine import perform_request_search, search_pattern from invenio.bibrank_citation_searcher import get_cited_by from invenio.bibrank_citation_indexer import get_bibrankmethod_lastupdate from invenio.bibformat_dblayer import save_preformatted_record from invenio.shellutils import split_cli_ids_arg from invenio.bibfield import get_record from invenio.bibtask import task_init, \ write_message, \ task_set_option, \ task_get_option, \ task_update_progress, \ task_has_option, \ task_sleep_now_if_required from invenio.bibformat_engine import format_record_1st_pass def fetch_last_updated(fmt): select_sql = "SELECT last_updated FROM format WHERE code = %s" row = run_sql(select_sql, (fmt.lower(), )) # Fallback in case we receive None instead of a valid date last_date = row[0][0] or datetime(year=1900, month=1, day=1) return last_date def store_last_updated(fmt, iso_date): sql = "UPDATE format SET last_updated = %s " \ "WHERE code = %s AND (last_updated < %s or last_updated IS NULL)" run_sql(sql, (iso_date, fmt.lower(), iso_date)) ### run the bibreformat task bibsched scheduled ### def bibreformat_task(fmt, recids, without_fmt, process): """BibReformat main task. @param fmt: output format to use @param process: @param recids: a list of record IDs to reformat @return: None """ write_message("Processing format %s" % fmt) t1 = os.times()[4] start_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S") latest_bibrank_run = get_bibrankmethod_lastupdate('citation') def related_records(recids, recids_processed): if fmt == "HDREF" and recids: # HDREF represents the references tab # the tab needs to be recomputed not only when the record changes # but also when one of the citations changes sql = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recids) def check_date(mod_date): return mod_date.strftime( "%Y-%m-%d %H:%M:%S") < latest_bibrank_run rel_recids = intbitset([recid for recid, mod_date in run_sql(sql) if check_date(mod_date)]) for r in rel_recids: recids |= intbitset(get_cited_by(r)) # To not process recids twice recids -= recids_processed # Adds to the set of processed recids recids_processed += recids return recids def recid_chunker(recids): recids_processed = intbitset() chunk = intbitset() for recid in recids: if len(chunk) == 5000: for r in related_records(chunk, recids_processed): yield r recids_processed += chunk chunk = intbitset() if recid not in recids_processed: chunk.add(recid) if chunk: for r in related_records(chunk, recids_processed): yield r recIDs = list(recid_chunker(recids)) ### list of corresponding record IDs was retrieved ### now format the selected records if without_fmt: write_message("Records to be processed: %d" % len(recIDs)) write_message("Out of it records without existing cache: %d" % len(without_fmt)) else: write_message("Records to be processed: %d" % len(recIDs)) ### Initialize main loop total_rec = 0 # Total number of records tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call ### Iterate over all records prepared in lists I (option) if process: total_rec_1, tbibformat_1, tbibupload_1 = iterate_over_new(recIDs, fmt) total_rec += total_rec_1 tbibformat += tbibformat_1 tbibupload += tbibupload_1 ### Store last run time if 
task_has_option("last"): write_message("storing run date to %s" % start_date) store_last_updated(fmt, start_date) ### Final statistics t2 = os.times()[4] elapsed = t2 - t1 message = "total records processed: %d" % total_rec write_message(message) message = "total processing time: %2f sec" % elapsed write_message(message) message = "Time spent on external call (os.system):" write_message(message) message = " bibformat: %2f sec" % tbibformat write_message(message) message = " bibupload: %2f sec" % tbibupload write_message(message) def check_validity_input_formats(input_formats): """Check the validity of every input format. @param input_formats: list of given formats @type input_formats: list @return: if there is any invalid input format it returns this value @rtype: string """ from invenio.search_engine import get_available_output_formats valid_formats = get_available_output_formats() # let's to extract the values of the available formats format_values = [] for aformat in valid_formats: format_values.append(aformat['value']) invalid_format = '' for aformat in input_formats: if aformat.lower() not in format_values: invalid_format = aformat.lower() break return invalid_format ### Bibreformat all selected records (using new python bibformat) ### (see iterate_over_old further down) def _update_recjson_format(recid, *args, **kwargs): """Update RECJSON cache. :param int recid: record id to process """ dummy = get_record(recid, reset_cache=True) def _update_format(recid, fmt): """Usual format update procedure, gets the formatted record and saves it. :param int recid: record id to process :param str fmt: format to update/create, i.e. 'HB' """ record, needs_2nd_pass = format_record_1st_pass(recID=recid, of=fmt, on_the_fly=True, save_missing=False) save_preformatted_record(recID=recid, of=fmt, res=record, needs_2nd_pass=needs_2nd_pass, low_priority=True) _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS = {'recjson': _update_recjson_format} """Specific functions to be used for each format if needed. If not set `_update_format` will be used. """ def iterate_over_new(recIDs, fmt): """Iterate over list of IDs. @param list: the list of record IDs to format @param fmt: the output format to use @return: tuple (total number of records, time taken to format, time taken to insert) """ tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call tot = len(recIDs) reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get( fmt.lower(), _update_format) for count, recID in enumerate(recIDs): t1 = os.times()[4] reformat_function(recID, fmt) t2 = os.times()[4] tbibformat += t2 - t1 if count % 100 == 0: write_message(" ... formatted %s records out of %s" % (count, tot)) task_update_progress('Formatted %s out of %s' % (count, tot)) task_sleep_now_if_required(can_stop_too=True) if tot % 100 != 0: write_message(" ... 
formatted %s records out of %s" % (tot, tot)) return tot, tbibformat, tbibupload def all_records(): """Produce record IDs for all available records.""" return intbitset(run_sql("SELECT id FROM bibrec")) def outdated_caches(fmt, last_updated, chunk_size=5000): sql = """SELECT br.id FROM bibrec AS br INNER JOIN bibfmt AS bf ON bf.id_bibrec = br.id WHERE br.modification_date >= %s AND bf.format = %s AND bf.last_updated < br.modification_date AND br.id BETWEEN %s AND %s""" last_updated_str = last_updated.strftime('%Y-%m-%d %H:%M:%S') recids = intbitset() max_id = run_sql("SELECT max(id) FROM bibrec")[0][0] or 0 for start in xrange(1, max_id + 1, chunk_size): end = start + chunk_size recids += intbitset(run_sql(sql, (last_updated_str, fmt, start, end))) return recids def missing_caches(fmt, chunk_size=100000): """Produce record IDs to be formated, because their fmt cache is missing. @param fmt: format to query for @return: record IDs generator without pre-created format cache """ write_message("Querying database for records without cache...") all_recids = intbitset() max_id = run_sql("SELECT max(id) FROM bibrec")[0][0] or 0 for start in xrange(1, max_id + 1, chunk_size): end = start + chunk_size sql = "SELECT id FROM bibrec WHERE id BETWEEN %s AND %s" recids = intbitset(run_sql(sql, (start, end))) sql = """SELECT id_bibrec FROM bibfmt WHERE id_bibrec BETWEEN %s AND %s AND format = %s""" without_fmt = intbitset(run_sql(sql, (start, end, fmt))) all_recids += recids - without_fmt return all_recids def query_records(params): """Produce record IDs from given query parameters. By passing the appriopriate CLI options, we can query here for additional records. """ write_message("Querying database (records query)...") res = intbitset() if params['field'] or params['collection'] or params['pattern']: if not params['collection']: # use search_pattern() whenever possible, as it can search # even in private collections res = search_pattern(p=params['pattern'], f=params['field'], m=params['matching']) else: # use perform_request_search when '-c' argument has been # defined, as it is not supported by search_pattern() res = intbitset(perform_request_search(req=None, of='id', c=params['collection'], p=params['pattern'], f=params['field'])) return res def task_run_core(): """Run the task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call. 
""" fmts = task_get_option('format', 'HB,RECJSON') for fmt in fmts.split(','): last_updated = fetch_last_updated(fmt) write_message("last stored run date is %s" % last_updated) recids = intbitset() if task_has_option("all"): recids += all_records() if task_has_option("last"): recids += outdated_caches(fmt, last_updated) if task_has_option('ignore_without') or \ task_has_option('collection') or \ task_has_option('field') or \ task_has_option('pattern') or \ task_has_option('recids'): without_fmt = intbitset() else: without_fmt = missing_caches(fmt) recids += without_fmt cli_recids = split_cli_ids_arg(task_get_option('recids', '')) recids += cli_recids query_params = {'collection': task_get_option('collection', ''), 'field': task_get_option('field', ''), 'pattern': task_get_option('pattern', ''), 'matching': task_get_option('matching', '')} recids += query_records(query_params) bibreformat_task(fmt, recids, without_fmt, not task_has_option('noprocess')) return True def main(): """Main that construct all the bibtask.""" task_init(authorization_action='runbibformat', authorization_msg="BibReformat Task Submission", description=""" BibReformat formats the records and saves the produced outputs for later retrieval. BibReformat is usually run periodically via BibSched in order to (1) format new records in the database and to (2) reformat records for which the meta data has been modified. BibReformat has to be run manually when (3) format config files have been modified, in order to see the changes in the web interface. Although it is not necessary to run BibReformat to display formatted records in the web interface, BibReformat allows to improve serving speed by precreating the outputs. It is suggested to run BibReformat for 'HB' output. Option -m cannot be used at the same time as option -c. Option -c prevents from finding records in private collections. Examples: bibreformat Format all new or modified records (in HB and RECJSON). bibreformat -o HD Format all new or modified records in HD. bibreformat -o HD,HB Format all new or modified records in HD and HB. bibreformat -a Force reformatting all records (in HB). bibreformat -c 'Photos' Force reformatting all records in 'Photos' collection (in HB). bibreformat -c 'Photos' -o HD Force reformatting all records in 'Photos' collection in HD. bibreformat -i 15 Force reformatting record 15 (in HB). bibreformat -i 15:20 Force reformatting records 15 to 20 (in HB). bibreformat -i 15,16,17 Force reformatting records 15, 16 and 17 (in HB). bibreformat -n Show how many records are to be (re)formatted. bibreformat -n -c 'Articles' Show how many records are to be (re)formatted in 'Articles' collection. bibreformat -oHB -s1h Format all new and modified records every hour, in HB. 
""", help_specific_usage=""" -o, --formats \t Specify output format/s (default HB) -n, --noprocess \t Count records to be formatted (no processing done) Reformatting options: -a, --all \t Force reformatting all records -c, --collection \t Force reformatting records by collection -f, --field \t Force reformatting records by field -p, --pattern \t Force reformatting records by pattern -i, --id \t Force reformatting records by record id(s) --no-missing \t Ignore reformatting records without format Pattern options: -m, --matching \t Specify if pattern is exact (e), regular expression (r), \t partial (p), any of the words (o) or all of the words (a) """, version=__revision__, specific_params=("ac:f:p:lo:nm:i:", ["all", "collection=", "matching=", "field=", "pattern=", "format=", "noprocess", "id=", "no-missing"]), task_submit_check_options_fnc=task_submit_check_options, task_submit_elaborate_specific_parameter_fnc= task_submit_elaborate_specific_parameter, task_run_fnc=task_run_core) def task_submit_check_options(): """Last checks and updating on the options...""" if not (task_has_option('all') or task_has_option('collection') or task_has_option('field') or task_has_option('pattern') or task_has_option('matching') or task_has_option('recids')): task_set_option('last', 1) return True def task_submit_elaborate_specific_parameter(key, value, opts, args): # pylint: disable-msg=W0613 """ Elaborate specific CLI parameters of BibReformat. @param key: a parameter key to check @param value: a value associated to parameter X{Key} @return: True for known X{Key} else False. """ if key in ("-a", "--all"): task_set_option("all", 1) elif key in ("--no-missing", ): task_set_option("ignore_without", 1) elif key in ("-c", "--collection"): task_set_option("collection", value) elif key in ("-n", "--noprocess"): task_set_option("noprocess", 1) elif key in ("-f", "--field"): task_set_option("field", value) elif key in ("-p", "--pattern"): task_set_option("pattern", value) elif key in ("-m", "--matching"): task_set_option("matching", value) elif key in ("-o", "--format"): input_formats = value.split(',') # check the validity of the given output formats invalid_format = check_validity_input_formats(input_formats) if invalid_format: try: raise Exception('Invalid output format.') except Exception: # pylint: disable-msg=W0703 from invenio.errorlib import register_exception register_exception( prefix="The given output format '%s' is not available or " "is invalid. Please try again" % (invalid_format, ), alert_admin=True) return else: # every given format is available task_set_option("format", value) elif key in ("-i", "--id"): task_set_option("recids", value) else: return False return True ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/bibindex/lib/bibindex_engine.py b/modules/bibindex/lib/bibindex_engine.py index 7d7333af7..f4b2e2adc 100644 --- a/modules/bibindex/lib/bibindex_engine.py +++ b/modules/bibindex/lib/bibindex_engine.py @@ -1,2314 +1,2314 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. # Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, -# 2010, 2011, 2012, 2013, 2014, 2015 CERN. +# 2010, 2011, 2012, 2013, 2014, 2015, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. 
# # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibIndex indexing engine implementation. See bibindex executable for entry point. """ __revision__ = "$Id$" import re import sys import time import fnmatch import inspect from datetime import datetime from invenio.config import CFG_SOLR_URL from invenio.bibindex_engine_config import CFG_MAX_MYSQL_THREADS, \ CFG_MYSQL_THREAD_TIMEOUT, \ CFG_CHECK_MYSQL_THREADS, \ CFG_BIBINDEX_INDEX_TABLE_TYPE, \ CFG_BIBINDEX_ADDING_RECORDS_STARTED_STR, \ CFG_BIBINDEX_UPDATE_MESSAGE, \ CFG_BIBINDEX_UPDATE_MODE, \ CFG_BIBINDEX_TOKENIZER_TYPE, \ CFG_BIBINDEX_WASH_INDEX_TERMS, \ CFG_BIBINDEX_SPECIAL_TAGS from invenio.bibauthority_config import \ CFG_BIBAUTHORITY_CONTROLLED_FIELDS_BIBLIOGRAPHIC from invenio.bibauthority_engine import \ get_control_nos_from_recID from invenio.bibauthorid_dbinterface import get_author_canonical_ids_for_recid from invenio.search_engine import perform_request_search, \ get_index_stemming_language, \ get_synonym_terms, \ search_pattern, \ search_unit_in_bibrec from invenio.dbquery import run_sql, DatabaseError, serialize_via_marshal, \ deserialize_via_marshal, wash_table_column_name from invenio.bibindex_engine_washer import wash_index_term from invenio.bibtask import task_init, write_message, get_datetime, \ task_set_option, task_get_option, task_get_task_param, \ task_update_progress, task_sleep_now_if_required from invenio.intbitset import intbitset from invenio.errorlib import register_exception from invenio.solrutils_bibindex_indexer import solr_commit from invenio.bibindex_tokenizers.BibIndexJournalTokenizer import \ CFG_JOURNAL_TAG, \ CFG_JOURNAL_PUBINFO_STANDARD_FORM, \ CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK from invenio.bibindex_termcollectors import TermCollector from invenio.bibindex_engine_utils import load_tokenizers, \ get_all_index_names_and_column_values, \ get_index_tags, \ get_field_tags, \ get_marc_tag_indexes, \ get_nonmarc_tag_indexes, \ get_all_indexes, \ get_index_virtual_indexes, \ get_virtual_index_building_blocks, \ get_index_id_from_index_name, \ run_sql_drop_silently, \ get_min_last_updated, \ remove_inexistent_indexes, \ get_all_synonym_knowledge_bases, \ get_index_remove_stopwords, \ get_index_remove_html_markup, \ get_index_remove_latex_markup, \ filter_for_virtual_indexes, \ get_records_range_for_index, \ make_prefix, \ list_union, \ recognize_marc_tag from invenio.bibindex_termcollectors import \ TermCollector, \ NonmarcTermCollector from invenio.memoiseutils import Memoise if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 # precompile some often-used regexp for speed reasons: re_subfields = re.compile('\$\$\w') re_datetime_shift = re.compile("([-\+]{0,1})([\d]+)([dhms])") re_prefix = re.compile('__[a-zA-Z1-9]*__') nb_char_in_line = 50 # for verbose pretty printing chunksize = 1000 # default size of chunks that the records will be treated by base_process_size = 4500 # process base size _last_word_table = None _TOKENIZERS = load_tokenizers() def list_unique(_list): """Returns a _list with duplicates removed.""" _dict = {} for e in _list: _dict[e] = 1 
return _dict.keys() # safety function for killing slow DB threads: def kill_sleepy_mysql_threads(max_threads=CFG_MAX_MYSQL_THREADS, thread_timeout=CFG_MYSQL_THREAD_TIMEOUT): """Check the number of DB threads and if there are more than MAX_THREADS of them, lill all threads that are in a sleeping state for more than THREAD_TIMEOUT seconds. (This is useful for working around the the max_connection problem that appears during indexation in some not-yet-understood cases.) If some threads are to be killed, write info into the log file. """ res = run_sql("SHOW FULL PROCESSLIST") if len(res) > max_threads: for row in res: r_id, dummy, dummy, dummy, r_command, r_time, dummy, dummy = row if r_command == "Sleep" and int(r_time) > thread_timeout: run_sql("KILL %s", (r_id, )) write_message("WARNING: too many DB threads, " + \ "killing thread %s" % r_id, verbose=1) return def get_associated_subfield_value(recID, tag, value, associated_subfield_code): """Return list of ASSOCIATED_SUBFIELD_CODE, if exists, for record RECID and TAG of value VALUE. Used by fulltext indexer only. Note: TAG must be 6 characters long (tag+ind1+ind2+sfcode), otherwise en empty string is returned. FIXME: what if many tag values have the same value but different associated_subfield_code? Better use bibrecord library for this. """ out = "" if len(tag) != 6: return out bibXXx = "bib" + tag[0] + tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT bb.field_number, b.tag, b.value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%%s AND bb.id_bibxxx=b.id AND tag LIKE %%s%%""" % (bibXXx, bibrec_bibXXx) res = run_sql(query, (recID, tag[:-1])) field_number = -1 for row in res: if row[1] == tag and row[2] == value: field_number = row[0] if field_number > 0: for row in res: if row[0] == field_number and row[1] == tag[:-1] + associated_subfield_code: out = row[2] break return out def swap_temporary_reindex_tables(index_id, reindex_prefix="tmp_"): """Atomically swap reindexed temporary table with the original one. 
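# Illustrative sketch, not part of the patch: the reindexing helpers here create
# tmp_-prefixed copies of the idxWORD/idxPAIR/idxPHRASE tables, let the indexer fill
# them, and finally RENAME them over the production tables while dropping the old
# ones.  index_id 7 is made up; this needs the Invenio database.
index_id = 7
init_temporary_reindex_tables(index_id, reindex_prefix="tmp_")
# ... reindex all records into the tmp_idxWORD07*/tmp_idxPAIR07*/tmp_idxPHRASE07* tables here ...
swap_temporary_reindex_tables(index_id, reindex_prefix="tmp_")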
Delete the now-old one.""" write_message("Putting new tmp index tables " + \ "for id %s into production" % index_id) run_sql( "RENAME TABLE " + "idxWORD%02dR TO old_idxWORD%02dR," % (index_id, index_id) + "%sidxWORD%02dR TO idxWORD%02dR," % (reindex_prefix, index_id, index_id) + "idxWORD%02dF TO old_idxWORD%02dF," % (index_id, index_id) + "%sidxWORD%02dF TO idxWORD%02dF," % (reindex_prefix, index_id, index_id) + "idxPAIR%02dR TO old_idxPAIR%02dR," % (index_id, index_id) + "%sidxPAIR%02dR TO idxPAIR%02dR," % (reindex_prefix, index_id, index_id) + "idxPAIR%02dF TO old_idxPAIR%02dF," % (index_id, index_id) + "%sidxPAIR%02dF TO idxPAIR%02dF," % (reindex_prefix, index_id, index_id) + "idxPHRASE%02dR TO old_idxPHRASE%02dR," % (index_id, index_id) + "%sidxPHRASE%02dR TO idxPHRASE%02dR," % (reindex_prefix, index_id, index_id) + "idxPHRASE%02dF TO old_idxPHRASE%02dF," % (index_id, index_id) + "%sidxPHRASE%02dF TO idxPHRASE%02dF;" % (reindex_prefix, index_id, index_id) ) write_message("Dropping old index tables for id %s" % index_id) run_sql_drop_silently("""DROP TABLE old_idxWORD%02dR, old_idxWORD%02dF, old_idxPAIR%02dR, old_idxPAIR%02dF, old_idxPHRASE%02dR, old_idxPHRASE%02dF""" % ((index_id, )* 6) ) # kwalitee: disable=sql def init_temporary_reindex_tables(index_id, reindex_prefix="tmp_"): """Create reindexing temporary tables.""" write_message("Creating new tmp index tables for id %s" % index_id) query = """DROP TABLE IF EXISTS %sidxWORD%02dF""" % \ (wash_table_column_name(reindex_prefix), index_id) run_sql_drop_silently(query) # kwalitee: disable=sql run_sql("""CREATE TABLE %sidxWORD%02dF ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM""" % (reindex_prefix, index_id)) query = """DROP TABLE IF EXISTS %sidxWORD%02dR""" % \ (wash_table_column_name(reindex_prefix), index_id) run_sql_drop_silently(query) # kwalitee: disable=sql run_sql("""CREATE TABLE %sidxWORD%02dR ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM""" % (reindex_prefix, index_id)) query = """DROP TABLE IF EXISTS %sidxPAIR%02dF""" % \ (wash_table_column_name(reindex_prefix), index_id) run_sql_drop_silently(query) # kwalitee: disable=sql run_sql("""CREATE TABLE %sidxPAIR%02dF ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM""" % (reindex_prefix, index_id)) query = """DROP TABLE IF EXISTS %sidxPAIR%02dR""" % \ (wash_table_column_name(reindex_prefix), index_id) run_sql_drop_silently(query) # kwalitee: disable=sql run_sql("""CREATE TABLE %sidxPAIR%02dR ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM""" % (reindex_prefix, index_id)) query = """DROP TABLE IF EXISTS %sidxPHRASE%02dF""" % \ (wash_table_column_name(reindex_prefix), index_id) run_sql_drop_silently(query) # kwalitee: disable=sql run_sql("""CREATE TABLE %sidxPHRASE%02dF ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM""" % (reindex_prefix, index_id)) query = """DROP TABLE IF EXISTS %sidxPHRASE%02dR""" % \ (wash_table_column_name(reindex_prefix), index_id) run_sql_drop_silently(query) # kwalitee: disable=sql 
    run_sql("""CREATE TABLE %sidxPHRASE%02dR (
                        id_bibrec mediumint(9) unsigned NOT NULL default '0',
                        termlist longblob,
                        type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT',
                        PRIMARY KEY (id_bibrec,type)
                        ) ENGINE=MyISAM""" % (reindex_prefix, index_id))


def remove_subfields(s):
    "Removes subfields from string, e.g. 'foo $$c bar' becomes 'foo bar'."
    return re_subfields.sub(' ', s)


def get_field_indexes(field):
    """Returns index names and ids corresponding to the given field"""
    if recognize_marc_tag(field):
        # field is actually a tag
        return get_marc_tag_indexes(field, virtual=False)
    else:
        return get_nonmarc_tag_indexes(field, virtual=False)

get_field_indexes_memoised = Memoise(get_field_indexes)


def get_index_tokenizer(index_id):
    """Returns value of a tokenizer field from idxINDEX database table
    @param index_id: id of the index
    """
    query = "SELECT tokenizer FROM idxINDEX WHERE id=%s" % index_id
    out = None
    try:
        res = run_sql(query)
        if res:
            out = _TOKENIZERS[res[0][0]]
    except DatabaseError:
        write_message(("Exception caught for SQL statement: %s; " + \
                       "column tokenizer might not exist") % query, sys.stderr)
    except KeyError:
        write_message("Exception caught: there is no such tokenizer")
        out = None
    return out


def detect_tokenizer_type(tokenizer):
    """
    Checks what is the main type of the tokenizer.
    For more information on tokenizer types take a look
    at BibIndexTokenizer class.

    @param tokenizer: instance of a tokenizer
    """
    from invenio.bibindex_tokenizers.BibIndexStringTokenizer import BibIndexStringTokenizer
    from invenio.bibindex_tokenizers.BibIndexRecJsonTokenizer import BibIndexRecJsonTokenizer
    from invenio.bibindex_tokenizers.BibIndexMultiFieldTokenizer import BibIndexMultiFieldTokenizer

    tokenizer_inheritance_tree = inspect.getmro(tokenizer.__class__)
    if BibIndexStringTokenizer in tokenizer_inheritance_tree:
        return CFG_BIBINDEX_TOKENIZER_TYPE['string']
    if BibIndexMultiFieldTokenizer in tokenizer_inheritance_tree:
        return CFG_BIBINDEX_TOKENIZER_TYPE['multifield']
    if BibIndexRecJsonTokenizer in tokenizer_inheritance_tree:
        return CFG_BIBINDEX_TOKENIZER_TYPE['recjson']
    return CFG_BIBINDEX_TOKENIZER_TYPE['unknown']


def get_last_updated_all_indexes():
    """Returns last modification date for all defined indexes"""
    query = """SELECT name, last_updated FROM idxINDEX"""
    res = run_sql(query)
    return res


def split_ranges(parse_string):
    """Parse a string and return the list of ranges."""
    recIDs = []
    ranges = parse_string.split(",")
    for arange in ranges:
        tmp_recIDs = arange.split("-")
        if len(tmp_recIDs) == 1:
            recIDs.append([int(tmp_recIDs[0]), int(tmp_recIDs[0])])
        else:
            if int(tmp_recIDs[0]) > int(tmp_recIDs[1]): # sanity check
                tmp = tmp_recIDs[0]
                tmp_recIDs[0] = tmp_recIDs[1]
                tmp_recIDs[1] = tmp
            recIDs.append([int(tmp_recIDs[0]), int(tmp_recIDs[1])])
    return recIDs


def get_word_tables(tables):
    """
    Given a list of table names it returns a list of tuples
    (index_id, index_name, index_tags).
    """
    wordTables = []
    if tables:
        for index in tables:
            index_id = get_index_id_from_index_name(index)
            if index_id:
                wordTables.append((index_id, index, get_index_tags(index)))
            else:
                write_message("Error: There is no %s words table."
% \ index, sys.stderr) return wordTables def get_date_range(var): "Returns the two dates contained as a low,high tuple" limits = var.split(",") if len(limits) == 1: low = get_datetime(limits[0]) return low, None if len(limits) == 2: low = get_datetime(limits[0]) high = get_datetime(limits[1]) return low, high return None, None def create_range_list(res): """Creates a range list from a recID select query result contained in res. The result is expected to have ascending numerical order.""" if not res: return [] row = res[0] if not row: return [] else: range_list = [[row, row]] for row in res[1:]: row_id = row if row_id == range_list[-1][1] + 1: range_list[-1][1] = row_id else: range_list.append([row_id, row_id]) return range_list def beautify_range_list(range_list): """Returns a non overlapping, maximal range list""" ret_list = [] for new in range_list: found = 0 for old in ret_list: if new[0] <= old[0] <= new[1] + 1 or new[0] - 1 <= old[1] <= new[1]: old[0] = min(old[0], new[0]) old[1] = max(old[1], new[1]) found = 1 break if not found: ret_list.append(new) return ret_list def truncate_index_table(index_name): """Properly truncate the given index.""" index_id = get_index_id_from_index_name(index_name) if index_id: write_message('Truncating %s index table in order to reindex.' % \ index_name, verbose=2) run_sql("""UPDATE idxINDEX SET last_updated='0000-00-00 00:00:00' WHERE id=%s""", (index_id, )) run_sql("TRUNCATE idxWORD%02dF" % index_id) # kwalitee: disable=sql run_sql("TRUNCATE idxWORD%02dR" % index_id) # kwalitee: disable=sql run_sql("TRUNCATE idxPHRASE%02dF" % index_id) # kwalitee: disable=sql run_sql("TRUNCATE idxPHRASE%02dR" % index_id) # kwalitee: disable=sql def update_index_last_updated(indexes, starting_time=None): """Update last_updated column of the index table in the database. Puts starting time there so that if the task was interrupted for record download, the records will be reindexed next time. @param indexes: list of indexes names """ if starting_time is None: return None for index_name in indexes: write_message("updating last_updated to %s...for %s index" % \ (starting_time, index_name), verbose=9) run_sql("UPDATE idxINDEX SET last_updated=%s WHERE name=%s", (starting_time, index_name)) def get_percentage_completed(num_done, num_total): """ Return a string containing the approx. percentage completed """ percentage_remaining = 100.0 * float(num_done) / float(num_total) if percentage_remaining: percentage_display = "(%.1f%%)" % (percentage_remaining, ) else: percentage_display = "" return percentage_display def _fill_dict_of_indexes_with_empty_sets(): """find_affected_records internal function. Creates dict: {'index_name1':set([]), ...} """ index_dict = {} tmp_all_indexes = get_all_indexes(virtual=False) for index in tmp_all_indexes: index_dict[index] = set([]) return index_dict def find_affected_records_for_index(indexes=None, recIDs=None, force_all_indexes=False): """ Function checks which records need to be changed/reindexed for given index/indexes. Makes use of hstRECORD table where different revisions of record are kept. If parameter force_all_indexes is set function will assign all recIDs to all indexes. @param indexes: names of indexes for reindexation separated by coma @param recIDs: recIDs for reindexation in form: [[range1_down, range1_up],[range2_down, range2_up]..] @param force_all_indexes: should we index all indexes? 
""" if indexes is None: indexes = [] if recIDs is None: recIDs = [] tmp_dates = dict(get_last_updated_all_indexes()) modification_dates = dict([(date, tmp_dates[date] or datetime(1000, 1, 1, 1, 1, 1)) for date in tmp_dates]) tmp_all_indexes = get_all_indexes(virtual=False) indexes = remove_inexistent_indexes(indexes, leave_virtual=False) if not indexes: return {} def _should_reindex_for_revision(index_name, revision_date): try: if modification_dates[index_name] < revision_date and \ index_name in indexes: return True return False except KeyError: return False if force_all_indexes: records_for_indexes = {} all_recIDs = [] for recIDs_range in recIDs: all_recIDs.extend(range(recIDs_range[0], recIDs_range[1]+1)) for index in indexes: records_for_indexes[index] = all_recIDs return records_for_indexes min_last_updated = get_min_last_updated(indexes)[0][0] or \ datetime(1000, 1, 1, 1, 1, 1) recIDs_info = [] for recIDs_range in recIDs: # firstly, determine which records were updated since min_last_updated: query = """SELECT id_bibrec,job_date,affected_fields FROM hstRECORD WHERE id_bibrec BETWEEN %s AND %s AND job_date > '%s'""" % \ (recIDs_range[0], recIDs_range[1], min_last_updated) res = run_sql(query) if res: recIDs_info.extend(res) # secondly, there may be newly inserted records which were # uploaded with old timestamp (via 005), so let us detect # those too, using their "real" modification_date: res = run_sql("""SELECT bibrec.id,modification_date,'' FROM bibrec, hstRECORD WHERE modification_date>%s AND bibrec.id=id_bibrec AND (SELECT COUNT(*) FROM hstRECORD WHERE id_bibrec=bibrec.id)=1""", (min_last_updated,)) if res: recIDs_info.extend(res) indexes_to_change = _fill_dict_of_indexes_with_empty_sets() for recID_info in recIDs_info: recID, revision, affected_fields = recID_info affected_fields = affected_fields.split(",") indexes_for_recID = set() for field in affected_fields: if field: field_indexes = get_field_indexes_memoised(field) or [] indexes_names = set([idx[1] for idx in field_indexes]) indexes_for_recID |= indexes_names else: # record was inserted, all fields were changed, # no specific affected fields indexes_for_recID |= set(tmp_all_indexes) indexes_for_recID_filtered = [ind for ind in indexes_for_recID if _should_reindex_for_revision(ind, revision)] for index in indexes_for_recID_filtered: indexes_to_change[index].add(recID) indexes_to_change = dict((k, list(sorted(v))) for k, v in indexes_to_change.iteritems() if v) return indexes_to_change def chunk_generator(rng): """ Splits one range into several smaller ones with respect to global chunksize variable. @param rng: range of records @type rng: list in the form: [1, 2000] """ global chunksize current_low = rng[0] current_high = rng[0] if rng[0] == None or rng[1] == None: raise StopIteration if rng[1] - rng[0] + 1 <= chunksize: yield rng else: while current_high - 1 < rng[1]: current_high += chunksize yield current_low, min(current_high - 1, rng[1]) current_low += chunksize class AbstractIndexTable(object): """ This class represents an index table in database. An index consists of three different kinds of tables: table which stores only words in db, table which stores pairs of words and table which stores whole phrases. The class represents only one table. Another instance of the class must be created in order to store different type of terms. This class is an abstract class. It contains methods to connect to db and methods which facilitate inserting/modifing/removing terms from it. 
The class also contains methods which help managing the memory. All specific methods for indexing can be found in corresponding classes for virtual and regular indexes. """ def __init__(self, index_name, table_type, table_prefix="", wash_index_terms=50): self.index_name = index_name self.index_id = get_index_id_from_index_name(index_name) self.table_type = table_type self.wash_index_terms = wash_index_terms self.table_name = wash_table_column_name(table_prefix + \ "idx" + \ table_type + \ ("%02d" % self.index_id) + "F") self.table_prefix = table_prefix self.value = {} # cache self.recIDs_in_mem = [] def put_into_db(self, mode="normal"): """Updates the current words table in the corresponding DB idxFOO table. Mode 'normal' means normal execution, mode 'emergency' means words index reverting to old state. """ write_message("%s %s wordtable flush started" % \ (self.table_name, mode)) write_message('...updating %d words into %s started' % \ (len(self.value), self.table_name)) task_update_progress("(%s:%s) flushed %d/%d words" % \ (self.table_name, self.index_name, 0, len(self.value))) self.recIDs_in_mem = beautify_range_list(self.recIDs_in_mem) tab_name = self.table_name[:-1] + "R" if mode == "normal": for group in self.recIDs_in_mem: query = """UPDATE %s SET type='TEMPORARY' WHERE id_bibrec BETWEEN %%s AND %%s AND type='CURRENT'""" % tab_name write_message(query % (group[0], group[1]), verbose=9) run_sql(query, (group[0], group[1])) nb_words_total = len(self.value) nb_words_report = int(nb_words_total / 10.0) nb_words_done = 0 for word in self.value.keys(): self.put_word_into_db(word) nb_words_done += 1 if nb_words_report != 0 and ((nb_words_done % nb_words_report) == 0): write_message('......processed %d/%d words' % \ (nb_words_done, nb_words_total)) percentage_display = get_percentage_completed(nb_words_done, nb_words_total) task_update_progress("(%s:%s) flushed %d/%d words %s" % \ (tab_name, self.index_name, nb_words_done, nb_words_total, percentage_display)) write_message('...updating %d words into %s ended' % \ (nb_words_total, tab_name)) write_message('...updating reverse table %s started' % tab_name) if mode == "normal": for group in self.recIDs_in_mem: query = """UPDATE %s SET type='CURRENT' WHERE id_bibrec BETWEEN %%s AND %%s AND type='FUTURE'""" % tab_name write_message(query % (group[0], group[1]), verbose=9) run_sql(query, (group[0], group[1])) query = """DELETE FROM %s WHERE id_bibrec BETWEEN %%s AND %%s AND type='TEMPORARY'""" % tab_name write_message(query % (group[0], group[1]), verbose=9) run_sql(query, (group[0], group[1])) write_message('End of updating wordTable into %s' % \ tab_name, verbose=9) elif mode == "emergency": for group in self.recIDs_in_mem: query = """UPDATE %s SET type='CURRENT' WHERE id_bibrec BETWEEN %%s AND %%s AND type='TEMPORARY'""" % tab_name write_message(query % (group[0], group[1]), verbose=9) run_sql(query, (group[0], group[1])) query = """DELETE FROM %s WHERE id_bibrec BETWEEN %%s AND %%s AND type='FUTURE'""" % tab_name write_message(query % (group[0], group[1]), verbose=9) run_sql(query, (group[0], group[1])) write_message('End of emergency flushing wordTable into %s' % \ tab_name, verbose=9) write_message('...updating reverse table %s ended' % tab_name) self.clean() self.recIDs_in_mem = [] write_message("%s %s wordtable flush ended" % \ (self.table_name, mode)) task_update_progress("(%s:%s) flush ended" % \ (self.table_name, self.index_name)) def put_word_into_db(self, word): """Flush a single word to the database and delete it from memory""" set 
= self.load_old_recIDs(word) if set is not None: # merge the word recIDs found in memory: hitlist_was_changed = self.merge_with_old_recIDs(word, set) if not hitlist_was_changed: # nothing to update: write_message("......... unchanged hitlist for ``%s''" % \ word, verbose=9) else: # yes there were some new words: write_message("......... updating hitlist for ``%s''" % \ word, verbose=9) - run_sql("UPDATE %s SET hitlist=%%s WHERE term=%%s" % wash_table_column_name(self.table_name), (set.fastdump(), word)) # kwalitee: disable=sql + run_sql("UPDATE %s SET hitlist=_binary %%s WHERE term=%%s" % wash_table_column_name(self.table_name), (set.fastdump(), word)) # kwalitee: disable=sql else: # the word is new, will create new set: write_message("......... inserting hitlist for ``%s''" % \ word, verbose=9) set = intbitset(self.value[word].keys()) try: - run_sql("INSERT INTO %s (term, hitlist) VALUES (%%s, %%s)" % wash_table_column_name(self.table_name), (word, set.fastdump())) # kwalitee: disable=sql + run_sql("INSERT INTO %s (term, hitlist) VALUES (%%s, _binary %%s)" % wash_table_column_name(self.table_name), (word, set.fastdump())) # kwalitee: disable=sql except Exception, e: ## We send this exception to the admin only when is not ## already reparing the problem. register_exception(prefix="Error when putting the term '%s' into db (hitlist=%s): %s\n" % (repr(word), set, e), alert_admin=(task_get_option('cmd') != 'repair')) if not set: # never store empty words run_sql("DELETE FROM %s WHERE term=%%s" % wash_table_column_name(self.table_name), (word,)) # kwalitee: disable=sql def put(self, recID, word, sign): """Keeps track of changes done during indexing and stores these changes in memory for further use. Indexing process needs this information later while filling in the database. @param recID: recID of the record we want to update in memory @param word: word we want to update @param sing: sign of the word, 1 means keep this word in database, -1 remove word from database """ value = self.value try: if self.wash_index_terms: word = wash_index_term(word, self.wash_index_terms) if value.has_key(word): # the word 'word' exist already: update sign value[word][recID] = sign else: value[word] = {recID: sign} except Exception as e: write_message("Error: Cannot put word %s with sign %d for recID %s." % \ (word, sign, recID)) def load_old_recIDs(self, word): """Load existing hitlist for the word from the database index files.""" query = "SELECT hitlist FROM %s WHERE term=%%s" % self.table_name res = run_sql(query, (word, )) if res: return intbitset(res[0][0]) else: return None def merge_with_old_recIDs(self, word, set): """Merge the system numbers stored in memory (hash of recIDs with value +1 or -1 according to whether to add/delete them) with those stored in the database index and received in set universe of recIDs for the given word. Return False in case no change was done to SET, return True in case SET was changed. """ oldset = intbitset(set) set.update_with_signs(self.value[word]) return set != oldset def clean(self): "Cleans the cache." self.value = {} class VirtualIndexTable(AbstractIndexTable): """ There are two types of indexes: virtual and regular/normal. Check WordTable class for more on normal indexes. This class represents a single index table for virtual index (see also: AbstractIndexTable). Virtual index doesn't store its own terms, it accumulates terms from other indexes. Good example of virtual index is the global index which stores terms from title, abstract, keyword, author and so on. 
This class contains methods for indexing virtual indexes. See also: run_update() """ def __init__(self, index_name, table_type, table_prefix="", wash_index_terms=50): """ Creates VirtualIndexTable instance. @param index_name: name of the index we want to reindex @param table_type: words, pairs or phrases @param table_prefix: add "tmp_" if you want to reindex to temporary table """ AbstractIndexTable.__init__(self, index_name, table_type, table_prefix, wash_index_terms) self.mode = "normal" self.dependent_indexes = dict(get_virtual_index_building_blocks(self.index_id)) def set_reindex_mode(self): """ Sets reindex mode. VirtualIndexTable will remove all its content from database and use insert_index function to repopulate it. """ self.mode = "reindex" def run_update(self, flush=10000): """ Function starts all updating processes for virtual index. It will take all information about pending changes from database from queue tables (idxWORD/PAIR/PHRASExxQ), process them and trigger appropriate indexing functions. @param flush: how many records we will put in one go into database (at most); see also: opt_flush in WordTable class """ global chunksize if self.mode == "reindex": self.clean_database() for index_id, index_name in self.dependent_indexes.iteritems(): rng = get_records_range_for_index(index_id) flush_count = 0 if not rng: continue write_message('Virtual index: %s is being reindexed for %s index' % \ (self.index_name, index_name)) chunks = chunk_generator(rng) try: while True: task_sleep_now_if_required() chunk = chunks.next() self.insert_index(index_id, chunk[0], chunk[1]) flush_count = flush_count + chunk[1] - chunk[0] + 1 self.recIDs_in_mem.append(list(chunk)) if flush_count >= flush: flush_count = 0 self.put_into_db() except StopIteration: if flush_count > 0: self.put_into_db() self.clean_queue_table(index_name) else: for index_id, index_name in self.dependent_indexes.iteritems(): query = """SELECT id_bibrec_low, id_bibrec_high, mode FROM %s WHERE index_name=%%s ORDER BY runtime ASC""" % \ (self.table_name[:-1] + "Q") entries = self.remove_duplicates(run_sql(query, (index_name, ))) if entries: write_message('Virtual index: %s is being updated for %s index' % \ (self.index_name, index_name)) for entry in entries: operation = None recID_low, recID_high, mode = entry if mode == CFG_BIBINDEX_UPDATE_MODE["Update"]: operation = self.update_index elif mode == CFG_BIBINDEX_UPDATE_MODE["Remove"]: operation = self.remove_index elif mode == CFG_BIBINDEX_UPDATE_MODE["Insert"]: operation = self.insert_index flush_count = 0 chunks = chunk_generator([recID_low, recID_high]) try: while True: task_sleep_now_if_required() chunk = chunks.next() operation(index_id, chunk[0], chunk[1]) flush_count = flush_count + chunk[1] - chunk[0] + 1 self.recIDs_in_mem.append(list(chunk)) if flush_count >= flush: flush_count = 0 self.put_into_db() except StopIteration: if flush_count > 0: self.put_into_db() self.clean_queue_table(index_name) def retrieve_new_values_from_index(self, index_id, records_range): """ Retrieves new values from dependent index for specific range of records. 
@param index_id: id of the dependent index @param records_range: the smallest and the biggest id in the range: [id_low, id_high] """ tab_name = "idx" + self.table_type + ("%02d" % index_id) + "R" query = """SELECT id_bibrec, termlist FROM %s WHERE id_bibrec BETWEEN %%s AND %%s""" % tab_name new_regular_values = run_sql(query, (records_range[0], records_range[1])) if new_regular_values: zipped = zip(*new_regular_values) new_regular_values = dict(zip(zipped[0], map(deserialize_via_marshal, zipped[1]))) else: new_regular_values = dict() return new_regular_values def retrieve_old_values(self, records_range): """ Retrieves old values from database for this virtual index for specific records range. @param records_range: the smallest and the biggest id in the range: [id_low, id_high] """ virtual_tab_name = self.table_name[:-1] + "R" query = """SELECT id_bibrec, termlist FROM %s WHERE type='CURRENT' AND id_bibrec BETWEEN %%s AND %%s""" % virtual_tab_name old_virtual_values = run_sql(query, (records_range[0], records_range[1])) if old_virtual_values: zipped = zip(*old_virtual_values) old_virtual_values = dict(zip(zipped[0], map(deserialize_via_marshal, zipped[1]))) else: old_virtual_values = dict() return old_virtual_values def update_index(self, index_id, recID_low, recID_high): """ Updates the state of virtual index for records in range: recID_low, recID_high for index specified by index_id. Function stores terms in idxWORD/PAIR/PHRASExxR tables with prefixes for specific index, for example term 'ellis' from author index will be stored in reversed table as: '__author__ellis'. It allows fast operations on only part of terms @param index_id: id of the dependent index we want to remove @param recID_low: first recID from the range of considered recIDs @param recID_high: last recID from the range of considered recIDs """ index_name = self.dependent_indexes[index_id] update_cache_for_record = self.update_cache_for_record virtual_tab_name = self.table_name[:-1] + "R" # take new values new_regular_values = self.retrieve_new_values_from_index(index_id, [recID_low, recID_high]) # take old values old_virtual_values = self.retrieve_old_values([recID_low, recID_high]) # update reversed table for recID in xrange(recID_low, recID_high + 1): new_values = new_regular_values.get(recID) or [] old_values = old_virtual_values.get(recID) or [] to_serialize = update_cache_for_record(index_name, recID, old_values, new_values) if len(to_serialize) == 0: continue run_sql("""INSERT INTO %s (id_bibrec,termlist,type) - VALUES (%%s,%%s,'FUTURE')""" % \ + VALUES (%%s,_binary %%s,'FUTURE')""" % \ wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal(to_serialize))) # kwalitee: disable=sql try: - run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,%%s,'CURRENT')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql + run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'CURRENT')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql except DatabaseError: pass def insert_index(self, index_id, recID_low, recID_high): """ Inserts terms from dependent index to virtual table without looking what's inside the virtual table and what terms are being added. It's faster than 'updating', but it can only be used when virtual table is free of terms from this dependent index. 
@param index_id: id of the dependent index we want to remove @param recID_low: first recID from the range of considered recIDs @param recID_high: last recID from the range of considered recIDs """ index_name = self.dependent_indexes[index_id] insert_to_cache_for_record = self.insert_to_cache_for_record virtual_tab_name = self.table_name[:-1] + "R" # take new values new_regular_values = self.retrieve_new_values_from_index(index_id, [recID_low, recID_high]) # take old values old_virtual_values = self.retrieve_old_values([recID_low, recID_high]) # update reversed table for recID in xrange(recID_low, recID_high + 1): new_values = new_regular_values.get(recID) or [] old_values = old_virtual_values.get(recID) or [] to_serialize = insert_to_cache_for_record(index_name, recID, old_values, new_values) if len(to_serialize) == 0: continue - run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,%%s,'FUTURE')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal(to_serialize))) # kwalitee: disable=sql + run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'FUTURE')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal(to_serialize))) # kwalitee: disable=sql try: - run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,%%s,'CURRENT')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql + run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'CURRENT')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql except DatabaseError: pass def remove_index(self, index_id, recID_low, recID_high): """ Removes words found in dependent index from reversed table of virtual index. Updates the state of the memory (for future removal from forward table). Takes into account that given words can be found in more that one dependent index and it won't mark these words for the removal process. 
@param index_id: id of the dependent index we want to remove @param recID_low: first recID from the range of considered recIDs @param recID_high: last recID from the range of considered recIDs """ index_name = self.dependent_indexes[index_id] remove_from_cache_for_record = self.remove_from_cache_for_record virtual_tab_name = self.table_name[:-1] + "R" # take old values old_virtual_values = self.retrieve_old_values([recID_low, recID_high]) # update reversed table for recID in xrange(recID_low, recID_high + 1): old_values = old_virtual_values.get(recID) or [] to_serialize = remove_from_cache_for_record(index_name, recID, old_values) if len(to_serialize) == 0: continue - run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,%%s,'FUTURE')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal(to_serialize))) # kwalitee: disable=sql + run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'FUTURE')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal(to_serialize))) # kwalitee: disable=sql try: - run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,%%s,'CURRENT')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql + run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'CURRENT')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql except DatabaseError: pass def update_cache_for_record(self, index_name, recID, old_values, new_values): """ Updates memory (cache) with information on what to remove/add/modify in forward table for specified record. It also returns new terms which should be indexed for given record. @param index_name: index name of dependent index @param recID: considered record @param old_values: all old values from all dependent indexes for this virtual index for recID @param new_values: new values from some dependent index which should be added """ prefix = make_prefix(index_name) put = self.put new_values_prefix = [prefix + term for term in new_values] part_values = [] tmp_old_values_prefix = [] # split old values from v.index into those with 'prefix' and those without for term in old_values: if term.startswith(prefix): term_without_prefix = re.sub(re_prefix, '', term) part_values.append(term_without_prefix) put(recID, term_without_prefix, -1) else: tmp_old_values_prefix.append(term) # remember not to remove words that occur more than once part_values = set(part_values) for value in tmp_old_values_prefix: term_without_prefix = re.sub(re_prefix, '', value) if term_without_prefix in part_values: put(recID, term_without_prefix, 1) for term_without_prefix in new_values: put(recID, term_without_prefix, 1) tmp_new_values_prefix = list(tmp_old_values_prefix) tmp_new_values_prefix.extend(new_values_prefix) return tmp_new_values_prefix def insert_to_cache_for_record(self, index_name, recID, old_values, new_values): """ Updates cache with terms which should be inserted to database. Used in insert_index function. See also: update_cache_for_record which is analogous for update_index function. """ prefix = make_prefix(index_name) append = old_values.append put = self.put for term in new_values: append(prefix + term) put(recID, term, 1) return old_values def remove_from_cache_for_record(self, index_name, recID, old_values): """ Updates information in cache with terms which should be removed from virtual table. Used in remove_index function. 
""" prefix = make_prefix(index_name) tmp_rest = [] tmp_removed = [] tmp_new_values = [] append_to_new = tmp_new_values.append append_to_rest = tmp_rest.append append_to_removed = tmp_removed.append put = self.put for term in old_values: if term.startswith(prefix): term_without_prefix = re.sub(re_prefix, '', term) append_to_removed(term_without_prefix) put(recID, term_without_prefix, -1) else: append_to_rest(re.sub(re_prefix, '', term)) append_to_new(term) to_remember = set(tmp_rest) & set(tmp_removed) for term_without_prefix in to_remember: put(recID, term_without_prefix, 1) return tmp_new_values def clean_database(self): """Removes all entries from corresponding tables in database""" query = """DELETE FROM %s""" % self.table_name run_sql(query) query = """DELETE FROM %s""" % self.table_name[:-1] + "R" run_sql(query) def clean_queue_table(self, index_name): """ Cleans queue table (i.e. idxWORD/PAIR/PHRASExxQ) for specific index. It means that function will remove all entries from db from queue table for this index. """ query = "DELETE FROM %s WHERE index_name='%s'" % \ (self.table_name[:-1].lstrip(self.table_prefix) + "Q", index_name) run_sql(query) def remove_duplicates(self, entries): """ Removes duplicates from a list of entries (taken from Queue table) in order to process a single command only once. Queue table may look like this: id (..) id_bibrec_low id_bibrec_high index_name mode ... 12 1 100 title update 13 1 100 title update We don't want to perform the same operation twice. First we want to squash the same commands into one. @param entries: list of entries taken from the database """ unique = set() return [entry for entry in entries if entry not in unique and not unique.add(entry)] def remove_dependent_index(self, index_name): """ Removes dependent index from this virtual index. It means removing all words from all records with prefix: __index_name__ from reversed table, and removing some of them from forward table if they don't appear in another dependent index. @param index_name: name of the dependent index to remove """ flush = 10000 dependent = self.dependent_indexes.values() if len(dependent) == 0: write_message("Specified index is not virtual...") return if index_name not in dependent: write_message("Dependent index already removed...") return index_id = get_index_id_from_index_name(index_name) records_range = get_records_range_for_index(index_id) write_message("Removing an index: %s" % index_name) if records_range: flush_count = 0 chunks = chunk_generator([records_range[0], records_range[1]]) try: while True: task_sleep_now_if_required() chunk = chunks.next() self.remove_index(index_id, chunk[0], chunk[1]) flush_count = flush_count + chunk[1] - chunk[0] + 1 self.recIDs_in_mem.append(chunk) if flush_count >= flush: flush_count = 0 self.put_into_db() except StopIteration: if flush_count > 0: self.put_into_db() class WordTable(AbstractIndexTable): """ This class represents a single index table of regular index (regular means it doesn't accumulates data from other indexes, but it takes data directly from metadata of records which are being indexed; for other type of index check: VirtualIndexTable). To start indexing process one need to invoke add_recIDs() method. For furher reading see description of this method. """ def __init__(self, index_name, table_type, table_prefix="", wash_index_terms=50): """Creates words table instance. 
@param index_name: the index name @param index_id: the index integer identificator @param fields_to_index: a list of fields to index @param table_type: type of the wordtable: Words, Pairs, Phrases @param table_prefix: prefix for table name, indexing will be performed on table: <>idx<>XXF @param wash_index_terms: do we wash index terms, and if yes (when >0), how many characters do we keep in the index terms; see max_char_length parameter of wash_index_term() """ AbstractIndexTable.__init__(self, index_name, table_type, table_prefix, wash_index_terms) self.tags = get_index_tags(index_name, virtual=False) self.nonmarc_tags = get_index_tags(index_name, virtual=False, tagtype="nonmarc") self.timestamp = datetime.now() self.virtual_indexes = get_index_virtual_indexes(self.index_id) self.virtual_index_update_mode = CFG_BIBINDEX_UPDATE_MODE["Update"] try: self.stemming_language = get_index_stemming_language(self.index_id) except KeyError: self.stemming_language = '' self.remove_stopwords = get_index_remove_stopwords(self.index_id) self.remove_html_markup = get_index_remove_html_markup(self.index_id) self.remove_latex_markup = get_index_remove_latex_markup(self.index_id) self.tokenizer = get_index_tokenizer(self.index_id)(self.stemming_language, self.remove_stopwords, self.remove_html_markup, self.remove_latex_markup) self.tokenizer_type = detect_tokenizer_type(self.tokenizer) self.default_tokenizer_function = self.tokenizer.get_tokenizing_function(table_type) self.special_tags = self._handle_special_tags() if self.stemming_language and self.table_name.startswith('idxWORD'): write_message('%s has stemming enabled, language %s' % (self.table_name, self.stemming_language)) def _handle_special_tags(self): """ Fills in a dict with special tags which always use the same tokenizer and this tokenizer is independent of index. """ special_tags = {} fields = self.tags + self.nonmarc_tags for tag in fields: if tag in CFG_BIBINDEX_SPECIAL_TAGS: for t in CFG_BIBINDEX_INDEX_TABLE_TYPE: if self.table_type == CFG_BIBINDEX_INDEX_TABLE_TYPE[t]: tokenizer_name = CFG_BIBINDEX_SPECIAL_TAGS[tag][t] tokenizer = _TOKENIZERS[tokenizer_name] instance = tokenizer(self.stemming_language, self.remove_stopwords, self.remove_html_markup, self.remove_latex_markup) special_tags[tag] = instance.get_tokenizing_function(self.table_type) break return special_tags def turn_off_virtual_indexes(self): """ Prevents from reindexing related virtual indexes. """ self.virtual_indexes = [] def turn_on_virtual_indexes(self): """ Turns on indexing related virtual indexes. """ self.virtual_indexes = get_index_virtual_indexes(self.index_id) def get_field(self, recID, tag): """Returns list of values of the MARC-21 'tag' fields for the record 'recID'.""" out = [] bibXXx = "bib" + tag[0] + tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%%s AND bb.id_bibxxx=b.id AND tag LIKE %%s""" % (bibXXx, bibrec_bibXXx) res = run_sql(query, (recID, tag)) for row in res: out.append(row[0]) return out def notify_virtual_indexes(self, recID_ranges): """ Informs all related virtual indexes about index change. Function leaves information about the change for each index in proper table in database (idxSOMETHINGxxQ). @param recID_ranges: low and high recIDs of ranges @type recID_ranges: list [[low_id1, high_id1], [low_id2, high_id2]...] 
""" query = """INSERT INTO %s (runtime, id_bibrec_low, id_bibrec_high, index_name, mode) VALUES (%%s, %%s, %%s, %%s, %%s)""" for index_id, index_name in self.virtual_indexes: tab_name = "idx%s%02dQ" % (self.table_type, index_id) full_query = query % tab_name for recID_range in recID_ranges: run_sql(full_query, (self.timestamp, recID_range[0], recID_range[1], self.index_name, self.virtual_index_update_mode)) def display(self): "Displays the word table." keys = self.value.keys() keys.sort() for k in keys: write_message("%s: %s" % (k, self.value[k])) def count(self): "Returns the number of words in the table." return len(self.value) def info(self): "Prints some information on the words table." write_message("The words table contains %d words." % self.count()) def lookup_words(self, word=""): "Lookup word from the words table." if not word: done = 0 while not done: try: word = raw_input("Enter word: ") done = 1 except (EOFError, KeyboardInterrupt): return if self.value.has_key(word): write_message("The word '%s' is found %d times." \ % (word, len(self.value[word]))) else: write_message("The word '%s' does not exist in the word file."\ % word) def add_recIDs(self, recIDs, opt_flush): """Fetches records which id in the recIDs range list and adds them to the wordTable. The recIDs range list is of the form: [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]]. """ global chunksize, _last_word_table flush_count = 0 records_done = 0 records_to_go = 0 for arange in recIDs: records_to_go = records_to_go + arange[1] - arange[0] + 1 time_started = time.time() # will measure profile time for arange in recIDs: i_low = arange[0] chunksize_count = 0 while i_low <= arange[1]: task_sleep_now_if_required() # calculate chunk group of recIDs and treat it: i_high = min(i_low + opt_flush - flush_count - 1, arange[1]) i_high = min(i_low + chunksize - chunksize_count - 1, i_high) try: self.chk_recID_range(i_low, i_high) except StandardError: if self.index_name == 'fulltext' and CFG_SOLR_URL: solr_commit() raise write_message(CFG_BIBINDEX_ADDING_RECORDS_STARTED_STR % \ (self.table_name, i_low, i_high)) if CFG_CHECK_MYSQL_THREADS: kill_sleepy_mysql_threads() percentage_display = get_percentage_completed(records_done, records_to_go) task_update_progress("(%s:%s) adding recs %d-%d %s" % (self.table_name, self.index_name, i_low, i_high, percentage_display)) self.del_recID_range(i_low, i_high) just_processed = self.add_recID_range(i_low, i_high) flush_count = flush_count + i_high - i_low + 1 chunksize_count = chunksize_count + i_high - i_low + 1 records_done = records_done + just_processed write_message(CFG_BIBINDEX_ADDING_RECORDS_STARTED_STR % \ (self.table_name, i_low, i_high)) if chunksize_count >= chunksize: chunksize_count = 0 # flush if necessary: if flush_count >= opt_flush: self.put_into_db() self.clean() if self.index_name == 'fulltext' and CFG_SOLR_URL: solr_commit() write_message("%s backing up" % (self.table_name)) flush_count = 0 self.log_progress(time_started, records_done, records_to_go) # iterate: i_low = i_high + 1 if flush_count > 0: self.put_into_db() if self.index_name == 'fulltext' and CFG_SOLR_URL: solr_commit() self.log_progress(time_started, records_done, records_to_go) self.notify_virtual_indexes(recIDs) def add_recID_range(self, recID1, recID2): """Add records from RECID1 to RECID2.""" wlist = {} self.recIDs_in_mem.append([recID1, recID2]) # special case of author indexes where we also add author # canonical IDs: if self.index_name in ('author', 'firstauthor', 'exactauthor', 
'exactfirstauthor'): for recID in range(recID1, recID2 + 1): if not wlist.has_key(recID): wlist[recID] = [] wlist[recID] = list_union(get_author_canonical_ids_for_recid(recID), wlist[recID]) marc, nonmarc = self.find_nonmarc_records(recID1, recID2) if marc: collector = TermCollector(self.tokenizer, self.tokenizer_type, self.table_type, self.tags, [recID1, recID2]) collector.set_special_tags(self.special_tags) wlist = collector.collect(marc, wlist) if nonmarc: collector = NonmarcTermCollector(self.tokenizer, self.tokenizer_type, self.table_type, self.nonmarc_tags, [recID1, recID2]) collector.set_special_tags(self.special_tags) wlist = collector.collect(nonmarc, wlist) # lookup index-time synonyms: synonym_kbrs = get_all_synonym_knowledge_bases() if synonym_kbrs.has_key(self.index_name): if len(wlist) == 0: return 0 recIDs = wlist.keys() for recID in recIDs: for word in wlist[recID]: word_synonyms = get_synonym_terms(word, synonym_kbrs[self.index_name][0], synonym_kbrs[self.index_name][1], use_memoise=True) if word_synonyms: wlist[recID] = list_union(word_synonyms, wlist[recID]) # were there some words for these recIDs found? recIDs = wlist.keys() for recID in recIDs: # was this record marked as deleted? if "DELETED" in self.get_field(recID, "980__c"): wlist[recID] = [] write_message("... record %d was declared deleted, removing its word list" % recID, verbose=9) write_message("... record %d, termlist: %s" % (recID, wlist[recID]), verbose=9) if len(wlist) == 0: return 0 # put words into reverse index table with FUTURE status: for recID in recIDs: - run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,%%s,'FUTURE')" % wash_table_column_name(self.table_name[:-1]), (recID, serialize_via_marshal(wlist[recID]))) # kwalitee: disable=sql + run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'FUTURE')" % wash_table_column_name(self.table_name[:-1]), (recID, serialize_via_marshal(wlist[recID]))) # kwalitee: disable=sql # ... and, for new records, enter the CURRENT status as empty: try: - run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,%%s,'CURRENT')" % wash_table_column_name(self.table_name[:-1]), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql + run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'CURRENT')" % wash_table_column_name(self.table_name[:-1]), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql except DatabaseError: # okay, it's an already existing record, no problem pass # put words into memory word list: put = self.put for recID in recIDs: for w in wlist[recID]: put(recID, w, 1) return len(recIDs) def find_nonmarc_records(self, recID1, recID2): """Divides recID range into two different tables, first one contains only recIDs of the records that are Marc type and the second one contains records of nonMarc type""" marc = range(recID1, recID2 + 1) nonmarc = [] query = """SELECT id FROM %s WHERE master_format <> 'marc' AND id BETWEEN %%s AND %%s""" % "bibrec" res = run_sql(query, (recID1, recID2)) if res: nonmarc = list(zip(*res)[0]) if len(nonmarc) == (recID2 - recID1 + 1): nonmarc = xrange(recID1, recID2 + 1) marc = [] else: for recID in nonmarc: marc.remove(recID) else: marc = xrange(recID1, recID2 + 1) return [marc, nonmarc] def log_progress(self, start, done, todo): """Calculate progress and store it. 
start: start time, done: records processed, todo: total number of records""" time_elapsed = time.time() - start # consistency check if time_elapsed == 0 or done > todo: return time_recs_per_min = done / (time_elapsed / 60.0) write_message("%d records took %.1f seconds to complete.(%1.f recs/min)"\ % (done, time_elapsed, time_recs_per_min)) if time_recs_per_min: write_message("Estimated runtime: %.1f minutes" % \ ((todo - done) / time_recs_per_min)) def put(self, recID, word, sign): """Keeps track of changes done during indexing and stores these changes in memory for further use. Indexing process needs this information later while filling in the database. @param recID: recID of the record we want to update in memory @param word: word we want to update @param sing: sign of the word, 1 means keep this word in database, -1 remove word from database """ value = self.value try: if self.wash_index_terms: word = wash_index_term(word, self.wash_index_terms) if value.has_key(word): # the word 'word' exist already: update sign value[word][recID] = sign else: value[word] = {recID: sign} except: write_message("Error: Cannot put word %s with sign %d for recID %s." % (word, sign, recID)) def del_recIDs(self, recIDs): """Fetches records which id in the recIDs range list and adds them to the wordTable. The recIDs range list is of the form: [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]]. """ count = 0 for arange in recIDs: task_sleep_now_if_required() self.del_recID_range(arange[0], arange[1]) count = count + arange[1] - arange[0] self.virtual_index_update_mode = CFG_BIBINDEX_UPDATE_MODE["Remove"] self.put_into_db() self.notify_virtual_indexes(recIDs) if self.index_name == 'fulltext' and CFG_SOLR_URL: solr_commit() def del_recID_range(self, low, high): """Deletes records with 'recID' system number between low and high from memory words index table.""" write_message("%s fetching existing words for records #%d-#%d started" % \ (self.table_name, low, high), verbose=3) self.recIDs_in_mem.append([low, high]) query = """SELECT id_bibrec,termlist FROM %sR as bb WHERE bb.id_bibrec BETWEEN %%s AND %%s""" % (self.table_name[:-1]) recID_rows = run_sql(query, (low, high)) for recID_row in recID_rows: recID = recID_row[0] wlist = deserialize_via_marshal(recID_row[1]) for word in wlist: self.put(recID, word, -1) write_message("%s fetching existing words for records #%d-#%d ended" % \ (self.table_name, low, high), verbose=3) def check_bad_words(self): """ Finds bad words in reverse tables. Returns True in case of bad words. """ query = """SELECT 1 FROM %sR WHERE type IN ('TEMPORARY','FUTURE') LIMIT 1""" \ % (self.table_name[:-1],) res = run_sql(query) return bool(res) def report_on_table_consistency(self): """Check reverse words index tables (e.g. idxWORD01R) for interesting states such as 'TEMPORARY' state. Prints small report (no of words, no of bad words). 
""" # find number of words: query = """SELECT COUNT(1) FROM %s""" % (self.table_name) res = run_sql(query, None, 1) if res: nb_words = res[0][0] else: nb_words = 0 # report stats: write_message("%s contains %d words" % (self.table_name, nb_words)) # find possible bad states in reverse tables: if self.check_bad_words(): write_message("EMERGENCY: %s needs to be repaired" % (self.table_name, )) else: write_message("%s is in consistent state" % (self.table_name)) def repair(self, opt_flush): """Repair the whole table""" # find possible bad states in reverse tables: if not self.check_bad_words(): return query = """SELECT id_bibrec FROM %sR WHERE type IN ('TEMPORARY','FUTURE')""" \ % (self.table_name[:-1]) res = intbitset(run_sql(query)) recIDs = create_range_list(list(res)) flush_count = 0 records_done = 0 records_to_go = 0 for arange in recIDs: records_to_go = records_to_go + arange[1] - arange[0] + 1 time_started = time.time() # will measure profile time for arange in recIDs: i_low = arange[0] chunksize_count = 0 while i_low <= arange[1]: task_sleep_now_if_required() # calculate chunk group of recIDs and treat it: i_high = min(i_low + opt_flush - flush_count - 1, arange[1]) i_high = min(i_low + chunksize - chunksize_count - 1, i_high) self.fix_recID_range(i_low, i_high) flush_count = flush_count + i_high - i_low + 1 chunksize_count = chunksize_count + i_high - i_low + 1 records_done = records_done + i_high - i_low + 1 if chunksize_count >= chunksize: chunksize_count = 0 # flush if necessary: if flush_count >= opt_flush: self.put_into_db("emergency") self.clean() flush_count = 0 self.log_progress(time_started, records_done, records_to_go) # iterate: i_low = i_high + 1 if flush_count > 0: self.put_into_db("emergency") self.log_progress(time_started, records_done, records_to_go) write_message("%s inconsistencies repaired." % self.table_name) def chk_recID_range(self, low, high): """Check if the reverse index table is in proper state""" ## check db query = """SELECT 1 FROM %sR WHERE type IN ('TEMPORARY','FUTURE') AND id_bibrec BETWEEN %%s AND %%s LIMIT 1""" % self.table_name[:-1] res = run_sql(query, (low, high), 1) if not res: write_message("%s for %d-%d is in consistent state" % (self.table_name, low, high)) return # okay, words table is consistent ## inconsistency detected! write_message("EMERGENCY: %s inconsistencies detected..." % self.table_name) error_message = "Errors found. You should check consistency of the " \ "%s - %sR tables.\nRunning 'bibindex --repair' is " \ "recommended." % (self.table_name, self.table_name[:-1]) write_message("EMERGENCY: " + error_message, stream=sys.stderr) raise StandardError(error_message) def fix_recID_range(self, low, high): """Try to fix reverse index database consistency (e.g. table idxWORD01R) in the low,high doc-id range. Possible states for a recID follow: CUR TMP FUT: very bad things have happened: warn! CUR TMP : very bad things have happened: warn! CUR FUT: delete FUT (crash before flushing) CUR : database is ok TMP FUT: add TMP to memory and del FUT from memory flush (revert to old state) TMP : very bad things have happened: warn! FUT: very bad things have happended: warn! 
""" state = {} query = "SELECT id_bibrec,type FROM %sR WHERE id_bibrec BETWEEN %%s AND %%s"\ % self.table_name[:-1] res = run_sql(query, (low, high)) for row in res: if not state.has_key(row[0]): state[row[0]] = [] state[row[0]].append(row[1]) ok = 1 # will hold info on whether we will be able to repair for recID in state.keys(): if not 'TEMPORARY' in state[recID]: if 'FUTURE' in state[recID]: if 'CURRENT' not in state[recID]: write_message("EMERGENCY: Index record %d is in inconsistent state. Can't repair it." % recID) ok = 0 else: write_message("EMERGENCY: Inconsistency in index record %d detected" % recID) query = """DELETE FROM %sR WHERE id_bibrec=%%s""" % self.table_name[:-1] run_sql(query, (recID,)) write_message("EMERGENCY: Inconsistency in record %d repaired." % recID) else: if 'FUTURE' in state[recID] and not 'CURRENT' in state[recID]: self.recIDs_in_mem.append([recID, recID]) # Get the words file query = """SELECT type,termlist FROM %sR WHERE id_bibrec=%%s""" % self.table_name[:-1] write_message(query, verbose=9) res = run_sql(query, (recID,)) for row in res: wlist = deserialize_via_marshal(row[1]) write_message("Words are %s " % wlist, verbose=9) if row[0] == 'TEMPORARY': sign = 1 else: sign = -1 for word in wlist: self.put(recID, word, sign) else: write_message("EMERGENCY: %s for %d is in inconsistent " "state. Couldn't repair it." % (self.table_name, recID), stream=sys.stderr) ok = 0 if not ok: error_message = "Unrepairable errors found. You should check " \ "consistency of the %s - %sR tables. Deleting affected " \ "TEMPORARY and FUTURE entries from these tables is " \ "recommended; see the BibIndex Admin Guide." % \ (self.table_name, self.table_name[:-1]) write_message("EMERGENCY: " + error_message, stream=sys.stderr) raise StandardError(error_message) def main(): """Main that construct all the bibtask.""" task_init(authorization_action='runbibindex', authorization_msg="BibIndex Task Submission", description="""Examples: \t%s -a -i 234-250,293,300-500 -u admin@localhost \t%s -a -w author,fulltext -M 8192 -v3 \t%s -d -m +4d -A on --flush=10000\n""" % ((sys.argv[0],) * 3), help_specific_usage=""" Indexing options: -a, --add\t\tadd or update words for selected records -d, --del\t\tdelete words for selected records -i, --id=low[-high]\t\tselect according to doc recID -m, --modified=from[,to]\tselect according to modification date -c, --collection=c1[,c2]\tselect according to collection -R, --reindex\treindex the selected indexes from scratch Repairing options: -k, --check\t\tcheck consistency for all records in the table(s) -r, --repair\t\ttry to repair all records in the table(s) Specific options: -w, --windex=w1[,w2]\tword/phrase indexes to consider (all) -M, --maxmem=XXX\tmaximum memory usage in kB (no limit) -f, --flush=NNN\t\tfull consistent table flush after NNN records (10000) --force\t\tforce indexing of all records for provided indexes -Z, --remove-dependent-index=w name of an index for removing from virtual index -l --all-virtual\t\t set of all virtual indexes; the same as: -w virtual_ind1, virtual_ind2, ... 
""", version=__revision__, specific_params=("adi:m:c:w:krRM:f:oZ:l", [ "add", "del", "id=", "modified=", "collection=", "windex=", "check", "repair", "reindex", "maxmem=", "flush=", "force", "remove-dependent-index=", "all-virtual" ]), task_stop_helper_fnc=task_stop_table_close_fnc, task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter, task_run_fnc=task_run_core, task_submit_check_options_fnc=task_submit_check_options) def task_submit_check_options(): """Check for options compatibility.""" if task_get_option("reindex"): if task_get_option("cmd") != "add" or task_get_option('id') or task_get_option('collection'): print >> sys.stderr, "ERROR: You can use --reindex only when adding modified record." return False return True def task_submit_elaborate_specific_parameter(key, value, opts, args): """ Given the string key it checks it's meaning, eventually using the value. Usually it fills some key in the options dict. It must return True if it has elaborated the key, False, if it doesn't know that key. eg: if key in ['-n', '--number']: self.options['number'] = value return True return False """ if key in ("-a", "--add"): task_set_option("cmd", "add") if ("-x", "") in opts or ("--del", "") in opts: raise StandardError("Can not have --add and --del at the same time!") elif key in ("-k", "--check"): task_set_option("cmd", "check") elif key in ("-r", "--repair"): task_set_option("cmd", "repair") elif key in ("-d", "--del"): task_set_option("cmd", "del") elif key in ("-i", "--id"): task_set_option('id', task_get_option('id') + split_ranges(value)) elif key in ("-m", "--modified"): task_set_option("modified", get_date_range(value)) elif key in ("-c", "--collection"): task_set_option("collection", value) elif key in ("-R", "--reindex"): task_set_option("reindex", True) elif key in ("-w", "--windex"): task_set_option("windex", value) elif key in ("-M", "--maxmem"): task_set_option("maxmem", int(value)) if task_get_option("maxmem") < base_process_size + 1000: raise StandardError("Memory usage should be higher than %d kB" % \ (base_process_size + 1000)) elif key in ("-f", "--flush"): task_set_option("flush", int(value)) elif key in ("-o", "--force"): task_set_option("force", True) elif key in ("-Z", "--remove-dependent-index",): task_set_option("remove-dependent-index", value) elif key in ("-l", "--all-virtual",): task_set_option("all-virtual", True) else: return False return True def task_stop_table_close_fnc(): """ Close tables to STOP. """ global _last_word_table if _last_word_table: _last_word_table.put_into_db() def get_recIDs_by_date_bibliographic(dates, index_name, force_all=False): """ Finds records that were modified between DATES[0] and DATES[1] for given index. If DATES is not set, then finds records that were modified since the last update of the index. 
@param wordtable_type: can be 'Words', 'Pairs' or 'Phrases' """ index_id = get_index_id_from_index_name(index_name) if not dates: query = """SELECT last_updated FROM idxINDEX WHERE id=%s""" res = run_sql(query, (index_id,)) if not res: return set([]) if not res[0][0] or force_all: dates = ("0000-00-00", None) else: dates = (res[0][0], None) if dates[1] is None: res = intbitset(run_sql("""SELECT b.id FROM bibrec AS b WHERE b.modification_date >= %s""", (dates[0],))) if index_name == 'fulltext': res |= intbitset(run_sql("""SELECT id_bibrec FROM bibrec_bibdoc JOIN bibdoc ON id_bibdoc=id WHERE text_extraction_date <= modification_date AND modification_date >= %s AND status<>'DELETED'""", (dates[0],))) elif dates[0] is None: res = intbitset(run_sql("""SELECT b.id FROM bibrec AS b WHERE b.modification_date <= %s""", (dates[1],))) if index_name == 'fulltext': res |= intbitset(run_sql("""SELECT id_bibrec FROM bibrec_bibdoc JOIN bibdoc ON id_bibdoc=id WHERE text_extraction_date <= modification_date AND modification_date <= %s AND status<>'DELETED'""", (dates[1],))) else: res = intbitset(run_sql("""SELECT b.id FROM bibrec AS b WHERE b.modification_date >= %s AND b.modification_date <= %s""", (dates[0], dates[1]))) if index_name == 'fulltext': res |= intbitset(run_sql("""SELECT id_bibrec FROM bibrec_bibdoc JOIN bibdoc ON id_bibdoc=id WHERE text_extraction_date <= modification_date AND modification_date >= %s AND modification_date <= %s AND status<>'DELETED'""", (dates[0], dates[1],))) # special case of author indexes where we need to re-index # those records that were affected by changed BibAuthorID attributions: if index_name in ('author', 'firstauthor', 'exactauthor', 'exactfirstauthor'): from invenio.bibauthorid_personid_maintenance import get_recids_affected_since # dates[1] is ignored, since BibAuthorID API does not offer upper limit search rec_list_author = intbitset(get_recids_affected_since(dates[0])) res = res | rec_list_author return set(res) def get_recIDs_by_date_authority(dates, index_name, force_all=False): """ Finds records that were modified between DATES[0] and DATES[1] for given index. If DATES is not set, then finds records that were modified since the last update of the index. Searches for bibliographic records connected to authority records that have been changed. """ index_id = get_index_id_from_index_name(index_name) index_tags = get_index_tags(index_name) if not dates: query = """SELECT last_updated FROM idxINDEX WHERE id=%s""" res = run_sql(query, (index_id,)) if not res: return set([]) if not res[0][0] or force_all: dates = ("0000-00-00", None) else: dates = (res[0][0], None) res = intbitset() for tag in index_tags: pattern = tag.replace('%', '*') matches = fnmatch.filter(CFG_BIBAUTHORITY_CONTROLLED_FIELDS_BIBLIOGRAPHIC.keys(), pattern) if not len(matches): continue for tag_match in matches: # get the type of authority record associated with this field auth_type = CFG_BIBAUTHORITY_CONTROLLED_FIELDS_BIBLIOGRAPHIC.get(tag_match) # find updated authority records of this type # dates[1] is ignored, needs dates[0] to find res now = datetime.now() auth_recIDs = search_pattern(p='980__a:' + auth_type) \ & search_unit_in_bibrec(str(dates[0]), str(now), search_type='m') # now find dependent bibliographic records for auth_recID in auth_recIDs: # get the fix authority identifier of this authority record control_nos = get_control_nos_from_recID(auth_recID) # there may be multiple control number entries! (the '035' field is repeatable!) 
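                    # --- Editorial note added for clarity (not part of the
                    # original sources): for every control number of the
                    # changed authority record, the dependent bibliographic
                    # records are looked up via the $0 subfield of the
                    # controlled field.  E.g. for a hypothetical tag_match
                    # '100__a' and control_no 'AUTHOR|(XYZ)1234', the search
                    # below is performed with p='100__0:"AUTHOR|(XYZ)1234"'.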
for control_no in control_nos: # get the bibrec IDs that refer to AUTHORITY_ID in TAG tag_0 = tag_match[:5] + '0' # possibly do the same for '4' subfields ? fieldvalue = '"' + control_no + '"' res |= search_pattern(p=tag_0 + ':' + fieldvalue) return set(res) def get_not_updated_recIDs(modified_dates, indexes, force_all=False): """Finds not updated recIDs in database for indexes. @param modified_dates: between this dates we should look for modified records @type modified_dates: [date_old, date_new] @param indexes: list of indexes @type indexes: string separated by coma @param force_all: if True all records will be taken """ found_recIDs = set() write_message(CFG_BIBINDEX_UPDATE_MESSAGE) for index in indexes: found_recIDs |= get_recIDs_by_date_bibliographic(modified_dates, index, force_all) found_recIDs |= get_recIDs_by_date_authority(modified_dates, index, force_all) return list(sorted(found_recIDs)) def get_recIDs_from_cli(indexes=[]): """ Gets recIDs ranges from CLI for indexing when user specified 'id' or 'collection' option or search for modified recIDs for provided indexes when recIDs are not specified. @param indexes: it's a list of specified indexes, which can be obtained from CLI with use of: get_indexes_from_cli() function. @type indexes: list of strings """ # need to first update idxINDEX table to find proper recIDs for reindexing if task_get_option("reindex"): for index_name in indexes: run_sql("""UPDATE idxINDEX SET last_updated='0000-00-00 00:00:00' WHERE name=%s""", (index_name,)) if task_get_option("id"): return task_get_option("id") elif task_get_option("collection"): l_of_colls = task_get_option("collection").split(",") recIDs = perform_request_search(c=l_of_colls) recIDs_range = [] for recID in recIDs: recIDs_range.append([recID, recID]) return recIDs_range elif task_get_option("cmd") == "add": recs = get_not_updated_recIDs(task_get_option("modified"), indexes, task_get_option("force")) recIDs_range = beautify_range_list(create_range_list(recs)) return recIDs_range return [] def get_indexes_from_cli(): """ Gets indexes from CLI and checks if they are valid. If indexes weren't specified function will return all known indexes. """ indexes = task_get_option("windex") all_virtual = task_get_option("all-virtual") if all_virtual: indexes = filter_for_virtual_indexes(get_all_indexes()) elif not indexes: indexes = get_all_indexes() else: indexes = indexes.split(",") indexes = remove_inexistent_indexes(indexes, leave_virtual=True) return indexes def remove_dependent_index(virtual_indexes, dependent_index): """ Removes dependent index from virtual indexes. @param virtual_indexes: names of virtual_indexes @type virtual_indexes: list of strings @param dependent_index: name of dependent index @type dependent_index: string """ if not virtual_indexes: write_message("You should specify a name of a virtual index...") return id_dependent = get_index_id_from_index_name(dependent_index) for index_name in virtual_indexes: index_id = get_index_id_from_index_name(index_name) for type_ in CFG_BIBINDEX_INDEX_TABLE_TYPE.itervalues(): vit = VirtualIndexTable(index_name, type_) vit.remove_dependent_index(dependent_index) task_sleep_now_if_required() query = """DELETE FROM idxINDEX_idxINDEX WHERE id_virtual=%s AND id_normal=%s""" run_sql(query, (index_id, id_dependent)) def should_update_virtual_indexes(): """ Decides if any virtual indexes should be updated. Decision is made based on arguments obtained from CLI. 
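Returns a true value when either the --all-virtual or the --windex option was supplied.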
""" return task_get_option("all-virtual") or task_get_option("windex") def update_virtual_indexes(virtual_indexes, reindex=False): """ Function will update all specified virtual_indexes. @param virtual_indexes: list of index names @param reindex: shall we reindex given v.indexes from scratch? """ kwargs = {} if reindex: kwargs.update({'table_prefix': 'tmp_'}) for index_name in virtual_indexes: if reindex: index_id = get_index_id_from_index_name(index_name) init_temporary_reindex_tables(index_id) for key, type_ in CFG_BIBINDEX_INDEX_TABLE_TYPE.iteritems(): kwargs.update({'wash_index_terms': CFG_BIBINDEX_WASH_INDEX_TERMS[key]}) vit = VirtualIndexTable(index_name, type_, **kwargs) vit.set_reindex_mode() vit.run_update() swap_temporary_reindex_tables(index_id) update_index_last_updated([index_name], task_get_task_param('task_starting_time')) task_sleep_now_if_required(can_stop_too=True) else: for key, type_ in CFG_BIBINDEX_INDEX_TABLE_TYPE.iteritems(): kwargs.update({'wash_index_terms': CFG_BIBINDEX_WASH_INDEX_TERMS[key]}) vit = VirtualIndexTable(index_name, type_, **kwargs) vit.run_update() task_sleep_now_if_required(can_stop_too=True) def task_run_core(): """Runs the task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call. """ global _last_word_table indexes = get_indexes_from_cli() if len(indexes) == 0: write_message("Specified indexes can't be found.") return True virtual_indexes = filter_for_virtual_indexes(indexes) regular_indexes = list(set(indexes) - set(virtual_indexes)) # check tables consistency if task_get_option("cmd") == "check": for index_name in indexes: wordTable = WordTable(index_name=index_name, table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"], wash_index_terms=50) _last_word_table = wordTable wordTable.report_on_table_consistency() task_sleep_now_if_required(can_stop_too=True) wordTable = WordTable(index_name=index_name, table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Pairs"], wash_index_terms=100) _last_word_table = wordTable wordTable.report_on_table_consistency() task_sleep_now_if_required(can_stop_too=True) wordTable = WordTable(index_name=index_name, table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Phrases"], wash_index_terms=0) _last_word_table = wordTable wordTable.report_on_table_consistency() task_sleep_now_if_required(can_stop_too=True) _last_word_table = None return True # virtual index: remove dependent index if task_get_option("remove-dependent-index"): remove_dependent_index(indexes, task_get_option("remove-dependent-index")) return True # virtual index: update if should_update_virtual_indexes(): update_virtual_indexes(virtual_indexes, task_get_option("reindex")) if len(regular_indexes) == 0: return True # regular index: initialization for Words,Pairs,Phrases recIDs_range = get_recIDs_from_cli(regular_indexes) recIDs_for_index = find_affected_records_for_index(regular_indexes, recIDs_range, (task_get_option("force") or \ task_get_option("reindex") or \ task_get_option("cmd") == "del")) if len(recIDs_for_index.keys()) == 0: write_message("Selected indexes/recIDs are up to date.") # Let's work on single words! 
for index_name in recIDs_for_index.keys(): index_id = get_index_id_from_index_name(index_name) reindex_prefix = "" if task_get_option("reindex"): reindex_prefix = "tmp_" init_temporary_reindex_tables(index_id, reindex_prefix) wordTable = WordTable(index_name=index_name, table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"], table_prefix=reindex_prefix, wash_index_terms=50) _last_word_table = wordTable wordTable.report_on_table_consistency() try: if task_get_option("cmd") == "del": if task_get_option("id") or task_get_option("collection"): wordTable.del_recIDs(recIDs_range) task_sleep_now_if_required(can_stop_too=True) else: error_message = "Missing IDs of records to delete from " \ "index %s." % wordTable.table_name write_message(error_message, stream=sys.stderr) raise StandardError(error_message) elif task_get_option("cmd") == "add": final_recIDs = beautify_range_list(create_range_list(recIDs_for_index[index_name])) wordTable.add_recIDs(final_recIDs, task_get_option("flush")) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("cmd") == "repair": wordTable.repair(task_get_option("flush")) task_sleep_now_if_required(can_stop_too=True) else: error_message = "Invalid command found processing %s" % \ wordTable.table_name write_message(error_message, stream=sys.stderr) raise StandardError(error_message) except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) register_exception(alert_admin=True) if _last_word_table: _last_word_table.put_into_db() raise wordTable.report_on_table_consistency() task_sleep_now_if_required(can_stop_too=True) # Let's work on pairs now wordTable = WordTable(index_name=index_name, table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Pairs"], table_prefix=reindex_prefix, wash_index_terms=100) _last_word_table = wordTable wordTable.report_on_table_consistency() try: if task_get_option("cmd") == "del": if task_get_option("id") or task_get_option("collection"): wordTable.del_recIDs(recIDs_range) task_sleep_now_if_required(can_stop_too=True) else: error_message = "Missing IDs of records to delete from " \ "index %s." % wordTable.table_name write_message(error_message, stream=sys.stderr) raise StandardError(error_message) elif task_get_option("cmd") == "add": final_recIDs = beautify_range_list(create_range_list(recIDs_for_index[index_name])) wordTable.add_recIDs(final_recIDs, task_get_option("flush")) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("cmd") == "repair": wordTable.repair(task_get_option("flush")) task_sleep_now_if_required(can_stop_too=True) else: error_message = "Invalid command found processing %s" % \ wordTable.table_name write_message(error_message, stream=sys.stderr) raise StandardError(error_message) except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) register_exception() if _last_word_table: _last_word_table.put_into_db() raise wordTable.report_on_table_consistency() task_sleep_now_if_required(can_stop_too=True) # Let's work on phrases now wordTable = WordTable(index_name=index_name, table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Phrases"], table_prefix=reindex_prefix, wash_index_terms=0) _last_word_table = wordTable wordTable.report_on_table_consistency() try: if task_get_option("cmd") == "del": if task_get_option("id") or task_get_option("collection"): wordTable.del_recIDs(recIDs_range) task_sleep_now_if_required(can_stop_too=True) else: error_message = "Missing IDs of records to delete from " \ "index %s." 
% wordTable.table_name write_message(error_message, stream=sys.stderr) raise StandardError(error_message) elif task_get_option("cmd") == "add": final_recIDs = beautify_range_list(create_range_list(recIDs_for_index[index_name])) wordTable.add_recIDs(final_recIDs, task_get_option("flush")) if not task_get_option("id") and not task_get_option("collection"): update_index_last_updated([index_name], task_get_task_param('task_starting_time')) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("cmd") == "repair": wordTable.repair(task_get_option("flush")) task_sleep_now_if_required(can_stop_too=True) else: error_message = "Invalid command found processing %s" % \ wordTable.table_name write_message(error_message, stream=sys.stderr) raise StandardError(error_message) except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) register_exception() if _last_word_table: _last_word_table.put_into_db() raise wordTable.report_on_table_consistency() task_sleep_now_if_required(can_stop_too=True) if task_get_option("reindex"): swap_temporary_reindex_tables(index_id, reindex_prefix) update_index_last_updated([index_name], task_get_task_param('task_starting_time')) task_sleep_now_if_required(can_stop_too=True) # update modification date also for indexes that were up to date if not task_get_option("id") and not task_get_option("collection") and \ task_get_option("cmd") == "add": up_to_date = set(indexes) - set(recIDs_for_index.keys()) update_index_last_updated(list(up_to_date), task_get_task_param('task_starting_time')) _last_word_table = None return True ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/bibrank/lib/bibrank_citation_indexer.py b/modules/bibrank/lib/bibrank_citation_indexer.py index 1a11c14a4..a5bd6b3e9 100644 --- a/modules/bibrank/lib/bibrank_citation_indexer.py +++ b/modules/bibrank/lib/bibrank_citation_indexer.py @@ -1,1288 +1,1288 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 CERN. +# Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
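# This indexer matches reference strings (report numbers, journal pubinfo, DOIs, HDLs, ISBNs and record IDs) against existing records, stores the resulting citer/citee pairs in rnkCITATIONDICT (with a change log in rnkCITATIONLOG), and caches the computed citation weights in the key/value store.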
__revision__ = "$Id$" import re import time import os import sys import ConfigParser from datetime import datetime from itertools import islice from invenio.intbitset import intbitset from invenio.dbquery import run_sql from invenio.bibindex_tokenizers.BibIndexJournalTokenizer import \ CFG_JOURNAL_PUBINFO_STANDARD_FORM, \ CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK from invenio.redisutils import get_redis from invenio.search_engine import search_pattern, \ search_unit, \ get_collection_reclist from invenio.bibformat_utils import parse_tag from invenio.bibknowledge import get_kb_mappings from invenio.bibtask import write_message, task_get_option, \ task_update_progress, task_sleep_now_if_required, \ task_get_task_param from invenio.bibindex_engine_utils import get_field_tags from invenio.docextract_record import get_record from invenio.dbquery import serialize_via_marshal re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK \ = re.compile(CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK) def compute_weights(): sql = "SELECT citee, COUNT(citer) FROM rnkCITATIONDICT GROUP BY citee" weights = {} for citee, c in run_sql(sql): weights[citee] = c return weights def recids_cache(collections, cache={}): if 'valid_recids' not in cache: cache['valid_recids'] = intbitset() for coll in collections.split(','): cache['valid_recids'] += get_collection_reclist(coll) return cache['valid_recids'] def deleted_recids_cache(cache={}): if 'deleted_records' not in cache: cache['deleted_records'] = search_unit(p='DELETED', f='980', m='a') return cache['deleted_records'] def get_recids_matching_query(p, f, config, m='e'): """Return set of recIDs matching query for pattern p in field f. @param p: pattern to search for @type recID: unicode string @param f: field to search in @type recID: unicode string @param config: bibrank configuration @type recID: dict @param m: type of matching (usually 'e' for exact or 'r' for regexp) @type recID: string """ p = p.encode('utf-8') f = f.encode('utf-8') function = config.get("rank_method", "function") collections = config.get(function, 'collections') if collections: ret = search_pattern(p=p, f=f, m=m) & recids_cache(collections) else: ret = search_pattern(p=p, f=f, m=m) - deleted_recids_cache() return ret def get_citation_weight(rank_method_code, config, chunk_size=25000): """return a dictionary which is used by bibrank daemon for generating the index of sorted research results by citation information """ quick = task_get_option("quick") != "no" # id option forces re-indexing a certain range # even if there are no new recs if task_get_option("id"): # construct a range of records to index updated_recids = [] for first, last in task_get_option("id"): updated_recids += range(first, last+1) if len(updated_recids) > 10000: str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(updated_recids[-10:]) else: str_updated_recids = str(updated_recids) write_message('Records to process: %s' % str_updated_recids) index_update_time = None else: bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code) if not quick: bibrank_update_time = "0000-00-00 00:00:00" write_message("bibrank: %s" % bibrank_update_time) index_update_time = get_bibindex_update_time() write_message("bibindex: %s" % index_update_time) if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"): index_update_time = "0000-00-00 00:00:00" updated_recids = get_modified_recs(bibrank_update_time, index_update_time) if len(updated_recids) > 10000: str_updated_recids = str(updated_recids[:10]) + ' ... 
' + str(updated_recids[-10:]) else: str_updated_recids = str(updated_recids) write_message("%s records to update" % str_updated_recids) if updated_recids: begin_time = time.time() try: function = config.get("rank_method", "function") config.get(function, 'collections') except ConfigParser.NoOptionError: config.set(function, 'collections', None) # Process fully the updated records weights = process_and_store(updated_recids, config, chunk_size) end_time = time.time() write_message("Total time of get_citation_weight(): %.2f sec" % (end_time - begin_time)) task_update_progress("citation analysis done") else: weights = None write_message("No new records added since last time this " "rank method was executed") return weights, index_update_time def process_and_store(recids, config, chunk_size): # Limit of # of citation we can loose in one chunk function = config.get("rank_method", "function") citation_loss_limit = int(config.get(function, "citation_loss_limit")) # If we have nothing to process # Do not update the weights dictionary modified = False # Process recent records first # The older records were most likely added by the above steps # to be reprocessed so they only have minor changes recids_iter = iter(sorted(recids, reverse=True)) # Split records to process into chunks so that we do not # fill up too much memory while True: task_sleep_now_if_required() chunk = list(islice(recids_iter, chunk_size)) if not chunk: break write_message("Processing chunk #%s to #%s" % (chunk[0], chunk[-1])) # The core work cites, refs = process_chunk(chunk, config) # Check that we haven't lost too many citations cites_diff = compute_dicts_diff(chunk, refs, cites) write_message("Citations balance %s" % cites_diff) if citation_loss_limit and cites_diff <= -citation_loss_limit: raise Exception('Lost too many references, aborting') # Store processed citations/references store_dicts(chunk, refs, cites) modified = True # Compute new weights dictionary if modified: weights = compute_weights() else: weights = None store_weights_cache(weights) return weights def store_weights_cache(weights): """Store into key/value store""" redis = get_redis() redis.set('citations_weights', serialize_via_marshal(weights)) def process_chunk(recids, config): tags = get_tags_config(config) # call the procedure that does the hard work by reading fields of # citations and references in the updated_recid's (but nothing else)! write_message("Entering get_citation_informations", verbose=9) citation_informations = get_citation_informations(recids, tags, config) write_message("Entering ref_analyzer", verbose=9) # call the analyser that uses the citation_informations to really # search x-cites-y in the coll.. 
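# ref_analyzer() returns two dicts keyed by the processed recids: citations (records citing each recid) and references (records each recid cites)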
return ref_analyzer(citation_informations, recids, tags, config) def get_bibrankmethod_lastupdate(rank_method_code): """Return the last excution date of bibrank method """ query = """SELECT DATE_FORMAT(last_updated, '%%Y-%%m-%%d %%H:%%i:%%s') FROM rnkMETHOD WHERE name =%s""" last_update_time = run_sql(query, [rank_method_code]) try: r = last_update_time[0][0] except IndexError: r = "0000-00-00 00:00:00" return r def get_bibindex_update_time(): """Return the last indexing date of the journals and report number indexes """ try: # check indexing times of `journal' and `reportnumber` # indexes, and only fetch records which have been indexed sql = "SELECT DATE_FORMAT(MIN(last_updated), " \ "'%%Y-%%m-%%d %%H:%%i:%%s') FROM idxINDEX WHERE name IN (%s,%s)" index_update_time = run_sql(sql, ('journal', 'reportnumber'), 1)[0][0] except IndexError: write_message("Not running citation indexer since journal/reportnumber" " indexes are not created yet.") index_update_time = "0000-00-00 00:00:00" return index_update_time def get_modified_recs(bibrank_method_lastupdate, indexes_lastupdate): """Get records to be updated by bibrank indexing Return the list of records which have been modified between the last execution of bibrank method and the latest journal/report index updates. The result is expected to have ascending id order. """ query = """SELECT id FROM bibrec WHERE modification_date >= %s AND modification_date < %s ORDER BY id ASC""" records = run_sql(query, (bibrank_method_lastupdate, indexes_lastupdate)) return [r[0] for r in records] def format_journal(format_string, mappings): """format the publ infostring according to the format""" def replace(char, data): return data.get(char, char) return ''.join(replace(c, mappings) for c in format_string) def get_tags_config(config): """Fetch needs config from our config file""" # Probably "citation" unless this file gets renamed function = config.get("rank_method", "function") write_message("config function %s" % function, verbose=9) tags = {} # 037a: contains (often) the "hep-ph/0501084" tag of THIS record try: tag = config.get(function, "primary_report_number") except ConfigParser.NoOptionError: tags['record_pri_number'] = None else: tags['record_pri_number'] = tagify(parse_tag(tag)) # 088a: additional short identifier for the record try: tag = config.get(function, "additional_report_number") except ConfigParser.NoOptionError: tags['record_add_number'] = None else: tags['record_add_number'] = tagify(parse_tag(tag)) # 999C5r. this is in the reference list, refers to other records. # Looks like: hep-ph/0408002 try: tag = config.get(function, "reference_via_report_number") except ConfigParser.NoOptionError: tags['refs_report_number'] = None else: tags['refs_report_number'] = tagify(parse_tag(tag)) # 999C5s. this is in the reference list, refers to other records. # Looks like: Phys.Rev.,A21,78 try: tag = config.get(function, "reference_via_pubinfo") except ConfigParser.NoOptionError: tags['refs_journal'] = None else: tags['refs_journal'] = tagify(parse_tag(tag)) # 999C5a. this is in the reference list, refers to other records. # Looks like: 10.1007/BF03170733 try: tag = config.get(function, "reference_via_doi") except ConfigParser.NoOptionError: tags['refs_doi'] = None else: tags['refs_doi'] = tagify(parse_tag(tag)) # 999C50. this is in the reference list, refers to other records. 
# Looks like: 1205 try: tag = config.get(function, "reference_via_record_id") except ConfigParser.NoOptionError: tags['refs_record_id'] = None else: tags['refs_record_id'] = tagify(parse_tag(tag)) # 999C5i. this is in the reference list, refers to other records. # Looks like: 9781439520031 try: tag = config.get(function, "reference_via_isbn") except ConfigParser.NoOptionError: tags['refs_isbn'] = None else: tags['refs_isbn'] = tagify(parse_tag(tag)) # Fields needed to construct the journals for this record try: tag = { 'pages': config.get(function, "pubinfo_journal_page"), 'year': config.get(function, "pubinfo_journal_year"), 'journal': config.get(function, "pubinfo_journal_title"), 'volume': config.get(function, "pubinfo_journal_volume"), } except ConfigParser.NoOptionError: tags['publication'] = None else: tags['publication'] = { 'pages': tagify(parse_tag(tag['pages'])), 'year': tagify(parse_tag(tag['year'])), 'journal': tagify(parse_tag(tag['journal'])), 'volume': tagify(parse_tag(tag['volume'])), } # Fields needed to lookup the DOIs tags['doi'] = get_field_tags('doi') # Fields needed to lookup the ISBN tags['isbn'] = get_field_tags('isbn') # 999C5s. A standardized way of writing a reference in the reference list. # Like: Nucl. Phys. B 710 (2000) 371 try: tags['publication_format'] = config.get(function, "pubinfo_journal_format") except ConfigParser.NoOptionError: tags['publication_format'] = CFG_JOURNAL_PUBINFO_STANDARD_FORM # Print values of tags for debugging write_message("tag values: %r" % [tags], verbose=9) return tags def get_journal_info(record, tags): """Fetch journal info from given record""" record_info = [] journals_fields = record.find_fields(tags['publication']['journal'][:5]) for field in journals_fields: # we store the tags and their values here # like c->444 y->1999 p->"journal of foo", # v->20 tagsvalues = {} try: tmp = field.get_subfield_values(tags['publication']['journal'][5])[0] except IndexError: pass else: tagsvalues["p"] = tmp try: tmp = field.get_subfield_values(tags['publication']['volume'][5])[0] except IndexError: pass else: tagsvalues["v"] = tmp try: tmp = field.get_subfield_values(tags['publication']['year'][5])[0] except IndexError: pass else: tagsvalues["y"] = tmp try: tmp = field.get_subfield_values(tags['publication']['pages'][5])[0] except IndexError: pass else: # if the page numbers have "x-y" take just x tagsvalues["c"] = tmp.split('-', 1)[0] # check if we have the required data ok = True for c in tags['publication_format']: if c in ('p', 'v', 'y', 'c'): if c not in tagsvalues: ok = False if ok: publ = format_journal(tags['publication_format'], tagsvalues) record_info += [publ] alt_volume = get_alt_volume(tagsvalues['v']) if alt_volume: tagsvalues2 = tagsvalues.copy() tagsvalues2['v'] = alt_volume publ = format_journal(tags['publication_format'], tagsvalues2) record_info += [publ] # Add codens for coden in get_kb_mappings('CODENS', value=tagsvalues['p']): tagsvalues2 = tagsvalues.copy() tagsvalues2['p'] = coden['key'] publ = format_journal(tags['publication_format'], tagsvalues2) record_info += [publ] return record_info def get_alt_volume(volume): """Get alternate volume form We handle the inversed volume letter bug Some metadata is wrong which leads to journals with the volume letter at the end. e.g. 
Phys.Rev.,51B,1 instead of Phys.Rev.,B51,1 """ alt_volume = None if re.match(ur'[a-zA-Z]\d+', volume, re.U|re.I): alt_volume = volume[1:] + volume[0] elif re.match(ur'\d+[a-zA-Z]', volume, re.U|re.I): alt_volume = volume[-1] + volume[:-1] return alt_volume def get_citation_informations(recid_list, tags, config, fetch_catchup_info=True): """Scans the collections searching references (999C5x -fields) and citations for items in the recid_list returns a 4 list of dictionaries that contains the citation information of cds records examples: [ {} {} {} {} ] [ {5: 'SUT-DP-92-70-5'}, { 93: ['astro-ph/9812088']}, { 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ] NB: stuff here is for analysing new or changed records. see "ref_analyzer" for more. """ begin_time = os.times()[4] records_info = { 'report-numbers': {}, 'journals': {}, 'doi': {}, 'hdl': {}, 'isbn': {}, 'record_id': {}, } references_info = { 'report-numbers': {}, 'journals': {}, 'doi': {}, 'record_id': {}, 'isbn': {}, 'hdl': {}, } # perform quick check to see if there are some records with # reference tags, because otherwise get.cit.inf would be slow even # if there is nothing to index: for done, recid in enumerate(recid_list): if done % 10 == 0: task_sleep_now_if_required() if done % 50 == 0: mesg = "get cit.inf done %s of %s" % (done, len(recid_list)) write_message(mesg) task_update_progress(mesg) record = get_record(recid) records_info['record_id'][recid] = [unicode(recid)] function = config.get("rank_method", "function") if config.get(function, 'collections'): if recid not in recids_cache(config.get(function, 'collections')): # do not treat this record since it is not in the collections # we want to process continue elif recid in deleted_recids_cache(): # do not treat this record since it was deleted; we # skip it like this in case it was only soft-deleted # e.g. via bibedit (i.e. 
when collection tag 980 is # DELETED but other tags like report number or journal # publication info remained the same, so the calls to # get_fieldvalues() below would return old values) continue if tags['refs_report_number']: references_info['report-numbers'][recid] = [t.value for t in record.find_subfields(tags['refs_report_number'])] msg = "references_info['report-numbers'][%s] = %r" \ % (recid, references_info['report-numbers'][recid]) write_message(msg, verbose=9) if tags['refs_journal']: references_info['journals'][recid] = [] for ref in record.find_subfields(tags['refs_journal']): try: # Inspire specific parsing journal, volume, page = ref.value.split(',') except ValueError: pass else: alt_volume = get_alt_volume(volume) if alt_volume: alt_ref = ','.join([journal, alt_volume, page]) references_info['journals'][recid] += [alt_ref] references_info['journals'][recid] += [ref.value] msg = "references_info['journals'][%s] = %r" \ % (recid, references_info['journals'][recid]) write_message(msg, verbose=9) if tags['refs_doi']: references = [t.value for t in record.find_subfields(tags['refs_doi'])] dois = [] hdls = [] for ref in references: if ref.startswith("hdl:"): hdls.append(ref[4:]) elif ref.startswith("doi:"): dois.append(ref[4:]) else: dois.append(ref) references_info['doi'][recid] = dois references_info['hdl'][recid] = hdls msg = "references_info['doi'][%s] = %r" % (recid, dois) write_message(msg, verbose=9) msg = "references_info['hdl'][%s] = %r" % (recid, hdls) write_message(msg, verbose=9) if tags['refs_record_id']: references_info['record_id'][recid] = [t.value for t in record.find_subfields(tags['refs_record_id'])] msg = "references_info['record_id'][%s] = %r" \ % (recid, references_info['record_id'][recid]) write_message(msg, verbose=9) if tags['refs_isbn']: references_info['isbn'][recid] = [t.value for t in record.find_subfields(tags['refs_isbn'])] msg = "references_info['isbn'][%s] = %r" \ % (recid, references_info['isbn'][recid]) write_message(msg, verbose=9) if not fetch_catchup_info: # We do not need the extra info continue if tags['record_pri_number'] or tags['record_add_number']: records_info['report-numbers'][recid] = [] if tags['record_pri_number']: records_info['report-numbers'][recid] += [t.value for t in record.find_subfields(tags['record_pri_number'])] if tags['record_add_number']: records_info['report-numbers'][recid] += [t.value for t in record.find_subfields(tags['record_add_number'])] msg = "records_info[%s]['report-numbers'] = %r" \ % (recid, records_info['report-numbers'][recid]) write_message(msg, verbose=9) if tags['doi']: records_info['doi'][recid] = [] records_info['hdl'][recid] = [] for tag in tags['doi']: for field in record.find_fields(tag[:5]): if 'DOI' in field.get_subfield_values('2'): dois = field.get_subfield_values('a') records_info['doi'][recid].extend(dois) elif 'HDL' in field.get_subfield_values('2'): hdls = field.get_subfield_values('a') records_info['hdl'][recid].extend(hdls) msg = "records_info[%s]['doi'] = %r" \ % (recid, records_info['doi'][recid]) write_message(msg, verbose=9) msg = "records_info[%s]['hdl'] = %r" \ % (recid, records_info['hdl'][recid]) write_message(msg, verbose=9) if tags['isbn']: records_info['isbn'][recid] = [] for tag in tags['isbn']: values = [t.value for t in record.find_subfields(tag)] records_info['isbn'][recid] += values msg = "records_info[%s]['isbn'] = %r" \ % (recid, records_info['isbn'][recid]) write_message(msg, verbose=9) # get a combination of # journal vol (year) pages if tags['publication']: 
records_info['journals'][recid] = get_journal_info(record, tags) msg = "records_info[%s]['journals'] = %r" \ % (recid, records_info['journals'][recid]) write_message(msg, verbose=9) mesg = "get cit.inf done fully" write_message(mesg) task_update_progress(mesg) end_time = os.times()[4] write_message("Execution time for generating citation info " "from record: %.2f sec" % (end_time - begin_time)) return records_info, references_info def standardize_report_number(report_number): """Format the report number to a standard form. Currently we: * remove category for arxiv papers """ report_number = re.sub(ur'(?:arXiv:)?(\d{4}\.\d{4}) \[[a-zA-Z\.-]+\]', ur'arXiv:\g<1>', report_number, re.I | re.U) return report_number def ref_analyzer(citation_informations, updated_recids, tags, config): """Analyze the citation informations and calculate the citation weight and cited by list dictionary. """ citations = {} for recid in updated_recids: citations[recid] = set() references = {} for recid in updated_recids: references[recid] = set() def step(msg_prefix, recid, done, total): if done % 30 == 0: task_sleep_now_if_required() if done % 1000 == 0: mesg = "%s done %s of %s" % (msg_prefix, done, total) write_message(mesg) task_update_progress(mesg) write_message("Processing: %s" % recid, verbose=9) def add_to_cites(citer, citee): # Make sure we don't add ourselves # Workaround till we know why we are adding ourselves. if citer == citee: return citations[citee].add(citer) if citer in updated_recids: references[citer].add(citee) def add_to_refs(citer, citee): # Make sure we don't add ourselves # Workaround till we know why we are adding ourselves. if citer == citee: return if citee in updated_recids: citations[citee].add(citer) references[citer].add(citee) # dict of recid -> institute_give_publ_id records_info, references_info = citation_informations t1 = os.times()[4] # Try to find references based on 999C5r # e.g 8 -> ([astro-ph/9889],[hep-ph/768]) # meaning: rec 8 contains these in bibliography write_message("Phase 1: Report numbers references") done = 0 for thisrecid, refnumbers in references_info['report-numbers'].iteritems(): step("Report numbers references", thisrecid, done, len(references_info['report-numbers'])) done += 1 for refnumber in (r for r in refnumbers if r): field = 'reportnumber' refnumber = standardize_report_number(refnumber) # Search for "hep-th/5644654 or such" in existing records recids = get_recids_matching_query(p=refnumber, f=field, config=config) write_message("These match searching %s in %s: %s" % (refnumber, field, list(recids)), verbose=9) if not recids: insert_into_missing(thisrecid, refnumber) else: remove_from_missing(refnumber) if len(recids) > 1: store_citation_warning('multiple-matches', refnumber) msg = "Whoops: record '%d' report number value '%s' " \ "matches many records; taking only the first one. %s" % \ (thisrecid, refnumber, repr(recids)) write_message(msg, stream=sys.stderr) for recid in list(recids)[:1]: # take only the first one add_to_refs(thisrecid, recid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) t2 = os.times()[4] # Try to find references based on 999C5s # e.g. Phys.Rev.Lett. 
53 (1986) 2285 write_message("Phase 2: Journal references") done = 0 for thisrecid, refs in references_info['journals'].iteritems(): step("Journal references", thisrecid, done, len(references_info['journals'])) done += 1 for reference in (r for r in refs if r): p = reference field = 'journal' # check reference value to see whether it is well formed: if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p): store_citation_warning('not-well-formed', p) msg = "Whoops, record '%d' reference value '%s' " \ "is not well formed; skipping it." % (thisrecid, p) write_message(msg, stream=sys.stderr) continue # skip this ill-formed value recids = get_recids_matching_query(p=p, f=field, config=config) write_message("These match searching %s in %s: %s" % (reference, field, list(recids)), verbose=9) if not recids: insert_into_missing(thisrecid, p) else: remove_from_missing(p) if len(recids) > 1: store_citation_warning('multiple-matches', p) msg = "Whoops: record '%d' reference value '%s' " \ "matches many records; taking only the first one. %s" % \ (thisrecid, p, repr(recids)) write_message(msg, stream=sys.stderr) for recid in list(recids)[:1]: # take only the first one add_to_refs(thisrecid, recid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) t3 = os.times()[4] # Try to find references based on 999C5a # e.g. 10.1007/BF03170733 write_message("Phase 3: DOI references") done = 0 for thisrecid, refs in references_info['doi'].iteritems(): step("DOI references", thisrecid, done, len(references_info['doi'])) done += 1 for reference in (r for r in refs if r): p = reference field = 'doi' recids = get_recids_matching_query(p=p, f=field, config=config) write_message("These match searching %s in %s: %s" % (reference, field, list(recids)), verbose=9) if not recids: insert_into_missing(thisrecid, p) else: remove_from_missing(p) if len(recids) > 1: store_citation_warning('multiple-matches', p) msg = "Whoops: record '%d' DOI value '%s' " \ "matches many records; taking only the first one. %s" % \ (thisrecid, p, repr(recids)) write_message(msg, stream=sys.stderr) for recid in list(recids)[:1]: # take only the first one add_to_refs(thisrecid, recid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) t4 = os.times()[4] # Try to find references based on 999C5a (hdl references) # e.g. 4263537/4000 write_message("Phase 4: HDL references") done = 0 for thisrecid, refs in references_info['hdl'].iteritems(): step("HDL references", thisrecid, done, len(references_info['hdl'])) done += 1 for reference in (r for r in refs if r): p = reference field = 'hdl' recids = get_recids_matching_query(p=p, f=field, config=config) write_message("These match searching %s in %s: %s" % (reference, field, list(recids)), verbose=9) if not recids: insert_into_missing(thisrecid, p) else: remove_from_missing(p) if len(recids) > 1: store_citation_warning('multiple-matches', p) msg = "Whoops: record '%d' HDL value '%s' " \ "matches many records; taking only the first one. %s" % \ (thisrecid, p, repr(recids)) write_message(msg, stream=sys.stderr) for recid in list(recids)[:1]: # take only the first one add_to_refs(thisrecid, recid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) t5 = os.times()[4] # Try to find references based on 999C50 # e.g. 
1244 write_message("Phase 5: Record ID references") done = 0 for thisrecid, refs in references_info['record_id'].iteritems(): step("Record ID references", thisrecid, done, len(references_info['record_id'])) done += 1 field = "001" for recid in (r for r in refs if r): valid = get_recids_matching_query(p=recid, f=field, config=config) write_message("These match searching %s in %s: %s" % (recid, field, list(valid)), verbose=9) if valid: add_to_refs(thisrecid, valid[0]) mesg = "done fully" write_message(mesg) task_update_progress(mesg) t6 = os.times()[4] # Try to find references based on 999C5i # e.g. 978-3-942171-73-1 write_message("Phase 6: ISBN references") done = 0 for thisrecid, refs in references_info['isbn'].iteritems(): step("ISBN references", thisrecid, done, len(references_info['isbn'])) done += 1 for reference in (r for r in refs if r): p = reference field = 'isbn' recids = get_recids_matching_query(p=p, f=field, config=config) write_message("These match searching %s in %s: %s" % (reference, field, list(recids)), verbose=9) if not recids: insert_into_missing(thisrecid, p) else: remove_from_missing(p) if len(recids) > 1: store_citation_warning('multiple-matches', p) msg = "Whoops: record '%d' ISBN value '%s' " \ "matches many records; taking only the first one. %s" % \ (thisrecid, p, repr(recids)) write_message(msg, stream=sys.stderr) for recid in list(recids)[:1]: # take only the first one add_to_refs(thisrecid, recid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) t7 = os.times()[4] # Search for stuff like CERN-TH-4859/87 in list of refs write_message("Phase 7: report numbers catchup") done = 0 for thisrecid, reportcodes in records_info['report-numbers'].iteritems(): step("Report numbers catchup", thisrecid, done, len(records_info['report-numbers'])) done += 1 for reportcode in (r for r in reportcodes if r): if reportcode.startswith('arXiv'): std_reportcode = standardize_report_number(reportcode) report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \ re.escape(std_reportcode) recids = get_recids_matching_query(p=report_pattern, f=tags['refs_report_number'], m='r', config=config) else: recids = get_recids_matching_query(p=reportcode, f=tags['refs_report_number'], config=config) for recid in recids: add_to_cites(recid, thisrecid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) # Find this record's pubinfo in other records' bibliography write_message("Phase 8: journals catchup") done = 0 t8 = os.times()[4] for thisrecid, rec_journals in records_info['journals'].iteritems(): step("Journals catchup", thisrecid, done, len(records_info['journals'])) done += 1 for journal in rec_journals: journal = journal.replace("\"", "") # Search the publication string like # Phys. 
Lett., B 482 (2000) 417 in 999C5s recids = get_recids_matching_query(p=journal, f=tags['refs_journal'], config=config) write_message("These records match %s in %s: %s" % (journal, tags['refs_journal'], list(recids)), verbose=9) for recid in recids: add_to_cites(recid, thisrecid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) write_message("Phase 9: DOI catchup") done = 0 t9 = os.times()[4] for thisrecid, dois in records_info['doi'].iteritems(): step("DOI catchup", thisrecid, done, len(records_info['doi'])) done += 1 for doi in dois: recids = get_recids_matching_query(p=doi, f=tags['refs_doi'], config=config) write_message("These records match %s in %s: %s" % (doi, tags['refs_doi'], list(recids)), verbose=9) for recid in recids: add_to_cites(recid, thisrecid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) write_message("Phase 10: HDL catchup") done = 0 t10 = os.times()[4] for thisrecid, hdls in records_info['hdl'].iteritems(): step("HDL catchup", thisrecid, done, len(records_info['hdl'])) done += 1 for hdl in hdls: recids = get_recids_matching_query(p=hdl, f=tags['refs_doi'], config=config) write_message("These records match %s in %s: %s" % (hdl, tags['refs_doi'], list(recids)), verbose=9) for recid in recids: add_to_cites(recid, thisrecid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) write_message("Phase 11: ISBN catchup") done = 0 t11 = os.times()[4] for thisrecid, isbns in records_info['isbn'].iteritems(): step("ISBN catchup", thisrecid, done, len(records_info['isbn'])) done += 1 for isbn in isbns: recids = get_recids_matching_query(p=isbn, f=tags['refs_isbn'], config=config) write_message("These records match %s in %s: %s" % (isbn, tags['refs_isbn'], list(recids)), verbose=9) for recid in recids: add_to_cites(recid, thisrecid) write_message("Phase 12: Record ID catchup") done = 0 t12 = os.times()[4] for thisrecid, record_ids in records_info['record_id'].iteritems(): step("Record ID catchup", thisrecid, done, len(records_info['record_id'])) done += 1 for record_id in record_ids: recids = get_recids_matching_query(p=record_id, f=tags['refs_record_id'], config=config) write_message("These records match %s in %s: %s" % (record_id, tags['refs_record_id'], list(recids)), verbose=9) for recid in recids: add_to_cites(recid, thisrecid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) if task_get_task_param('verbose') >= 3: # Print only X first to prevent flood write_message("citation_list (x is cited by y):") write_message(dict(islice(citations.iteritems(), 10))) write_message("size: %s" % len(citations)) write_message("reference_list (x cites y):") write_message(dict(islice(references.iteritems(), 10))) write_message("size: %s" % len(references)) t13 = os.times()[4] write_message("Execution time for analyzing the citation information " "generating the dictionary:") write_message("... checking ref report numbers: %.2f sec" % (t2-t1)) write_message("... checking ref journals: %.2f sec" % (t3-t2)) write_message("... checking ref DOI: %.2f sec" % (t4-t3)) write_message("... checking ref HDL: %.2f sec" % (t5-t4)) write_message("... checking ref Record ID: %.2f sec" % (t6-t5)) write_message("... checking ref ISBN: %.2f sec" % (t7-t6)) write_message("... checking rec report numbers: %.2f sec" % (t8-t7)) write_message("... checking rec journals: %.2f sec" % (t9-t8)) write_message("... checking rec DOI: %.2f sec" % (t10-t9)) write_message("... checking rec HDL: %.2f sec" % (t11-t10)) write_message("... 
checking rec ISBN: %.2f sec" % (t12-t11)) write_message("... checking rec Record ID: %.2f sec" % (t13-t12)) write_message("... total time of ref_analyze: %.2f sec" % (t13-t1)) return citations, references def compute_refs_diff(recid, new_refs): """ Given a set of references for a record, returns how many references were added to it. The value can be negative which means the record lost citations. """ old_refs = set(row[0] for row in run_sql("""SELECT citee FROM rnkCITATIONDICT WHERE citer = %s""", [recid])) refs_to_add = new_refs - old_refs refs_to_delete = old_refs - new_refs return len(refs_to_add) - len(refs_to_delete) def compute_cites_diff(recid, new_cites): """ This function does the same thing as compute_refs_diff but with citations. """ old_cites = set(row[0] for row in run_sql("""SELECT citer FROM rnkCITATIONDICT WHERE citee = %s""", [recid])) cites_to_add = new_cites - old_cites cites_to_delete = old_cites - new_cites return len(cites_to_add) - len(cites_to_delete) def compute_dicts_diff(recids, refs, cites): """ Given the new dictionaries for references and citations, computes how many references were added or removed by comparing them to the current stored in the database. """ cites_diff = 0 for recid in recids: cites_diff += compute_refs_diff(recid, refs[recid]) cites_diff += compute_cites_diff(recid, cites[recid]) return cites_diff def store_dicts(recids, refs, cites): """Insert the reference and citation list into the database""" for recid in recids: replace_refs(recid, refs[recid]) replace_cites(recid, cites[recid]) def replace_refs(recid, new_refs): """ Given a set of references, replaces the references of given recid in the database. The changes are logged into rnkCITATIONLOG. """ old_refs = set(row[0] for row in run_sql("""SELECT citee FROM rnkCITATIONDICT WHERE citer = %s""", [recid])) refs_to_add = new_refs - old_refs refs_to_delete = old_refs - new_refs for ref in refs_to_add: write_message('adding ref %s %s' % (recid, ref), verbose=1) now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("""INSERT INTO rnkCITATIONDICT (citer, citee, last_updated) VALUES (%s, %s, %s)""", (recid, ref, now)) run_sql("""INSERT INTO rnkCITATIONLOG (citer, citee, type, action_date) VALUES (%s, %s, %s, %s)""", (recid, ref, 'added', now)) for ref in refs_to_delete: write_message('deleting ref %s %s' % (recid, ref), verbose=1) now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("""DELETE FROM rnkCITATIONDICT WHERE citer = %s and citee = %s""", (recid, ref)) run_sql("""INSERT INTO rnkCITATIONLOG (citer, citee, type, action_date) VALUES (%s, %s, %s, %s)""", (recid, ref, 'removed', now)) def replace_cites(recid, new_cites): """ Given a set of citations, replaces the citations of given recid in the database. The changes are logged into rnkCITATIONLOG. 
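Citers present in new_cites but not yet stored are inserted; stored citers that are no longer present are removed.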
See @replace_refs """ old_cites = set(row[0] for row in run_sql("""SELECT citer FROM rnkCITATIONDICT WHERE citee = %s""", [recid])) cites_to_add = new_cites - old_cites cites_to_delete = old_cites - new_cites for cite in cites_to_add: write_message('adding cite %s %s' % (recid, cite), verbose=1) now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("""INSERT INTO rnkCITATIONDICT (citee, citer, last_updated) VALUES (%s, %s, %s)""", (recid, cite, now)) run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date) VALUES (%s, %s, %s, %s)""", (recid, cite, 'added', now)) for cite in cites_to_delete: write_message('deleting cite %s %s' % (recid, cite), verbose=1) now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("""DELETE FROM rnkCITATIONDICT WHERE citee = %s and citer = %s""", (recid, cite)) run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date) VALUES (%s, %s, %s, %s)""", (recid, cite, 'removed', now)) def insert_into_missing(recid, report): """Mark reference string as missing. If a reference is a report number / journal / DOI but we do not have the corresping record in the database, we mark that particualar reference string as missing, by adding a row in rnkCITATIONDATAEXT. The recid represents the record containing the reference string. """ if len(report) >= 255: # Invalid report, it is too long # and does not fit in the database column # (currently varchar 255) return wasalready = run_sql("""SELECT id_bibrec FROM rnkCITATIONDATAEXT WHERE id_bibrec = %s AND extcitepubinfo = %s""", (recid, report)) if not wasalready: run_sql("""INSERT INTO rnkCITATIONDATAEXT(id_bibrec, extcitepubinfo) VALUES (%s,%s)""", (recid, report)) def remove_from_missing(report): """Remove the reference string from the missing table See @insert_into_missing""" run_sql("""DELETE FROM rnkCITATIONDATAEXT WHERE extcitepubinfo = %s""", (report,)) def print_missing(num): """ Print the contents of rnkCITATIONDATAEXT table containing external records that were cited by NUM or more internal records. NUM is by default taken from the -E command line option. """ if not num: num = task_get_option("print-extcites") write_message("Listing external papers cited by %i or more \ internal records:" % num) res = run_sql("""SELECT COUNT(id_bibrec), extcitepubinfo FROM rnkCITATIONDATAEXT GROUP BY extcitepubinfo HAVING COUNT(id_bibrec) >= %s ORDER BY COUNT(id_bibrec) DESC""", (num,)) for cnt, brec in res: print str(cnt), "\t", brec write_message("Listing done.") def tagify(parsedtag): """aux auf to make '100__a' out of ['100','','','a']""" tag = "" for t in parsedtag: if t == '': t = '_' tag += t return tag def store_citation_warning(warning_type, cit_info): """Store citation indexing warnings in the database If we encounter a problem during the citation indexing, such as multiple results for a report number, we store a warning in rnkCITATIONDATAERR """ r = run_sql("""SELECT 1 FROM rnkCITATIONDATAERR WHERE type = %s AND citinfo = %s""", (warning_type, cit_info)) if not r: run_sql("""INSERT INTO rnkCITATIONDATAERR (type, citinfo) VALUES (%s, %s)""", (warning_type, cit_info)) diff --git a/modules/bibrank/lib/bibrank_citerank_indexer.py b/modules/bibrank/lib/bibrank_citerank_indexer.py index 93377acb8..e7b3279e7 100644 --- a/modules/bibrank/lib/bibrank_citerank_indexer.py +++ b/modules/bibrank/lib/bibrank_citerank_indexer.py @@ -1,881 +1,881 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2009, 2010, 2011 CERN. +# Copyright (C) 2009, 2010, 2011, 2016 CERN. 
# # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Implementation of different ranking methods based on the citation graph: - citation count/ time decayed citation count - pagerank / pagerank with external citations - time decayed pagerank """ # pylint: disable=E0611 import ConfigParser from math import exp import datetime import time import re import sys try: from numpy import array, ones, zeros, int32, float32, sqrt, dot import_numpy = 1 except ImportError: import_numpy = 0 if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 from invenio.dbquery import run_sql, serialize_via_marshal from invenio.bibtask import write_message from invenio.config import CFG_ETCDIR def get_citations_from_file(filename): """gets the citation data (who cites who) from a file and returns - a dictionary of type x:{x1,x2..}, where x is cited by x1,x2.. - a dictionary of type a:{b}, where recid 'a' is asociated with an index 'b' """ cit = {} dict_of_ids = {} count = 0 try: citation_file = open(filename, "r") except StandardError: write_message("Cannot find file: %s" % filename, sys.stderr) raise StandardError for line in citation_file: tokens = line.strip().split() recid_cites = int(tokens[0]) recid_cited = int(tokens[1]) if recid_cited not in cit: cit[recid_cited] = [] #without this, duplicates might be introduced if recid_cites not in cit[recid_cited] and recid_cites != recid_cited: cit[recid_cited].append(recid_cites) if recid_cites not in dict_of_ids: dict_of_ids[recid_cites] = count count += 1 if recid_cited not in dict_of_ids: dict_of_ids[recid_cited] = count count += 1 citation_file.close() write_message("Citation data collected from file: %s" %filename, verbose=2) write_message("Ids and recids corespondace: %s" \ %str(dict_of_ids), verbose=9) write_message("Citations: %s" % str(cit), verbose=9) return cit, dict_of_ids def get_citations_from_db(): """gets the citation data (who cites who) from the rnkCITATIONDATA table, and returns: -a dictionary of type x:{x1,x2..}, where x is cited by x1,x2.. 
-a dict of type a:{b} where recid 'a' is asociated with an index 'b'""" dict_of_ids = {} cit = {} rows = run_sql("SELECT citer, citee FROM rnkCITATIONDICT") for citer, citee in rows: cit.setdefault(citee, set()).add(citer) count = 0 for item in cit: if item in cit[item]: cit[item].remove(item) if item not in dict_of_ids: dict_of_ids[item] = count count += 1 for value in cit[item]: if value not in dict_of_ids: dict_of_ids[value] = count count += 1 write_message("Citation data collected", verbose=2) write_message("Ids and recids correspondence: %s" \ % str(dict_of_ids), verbose=9) write_message("Citations: %s" % str(cit), verbose=9) return cit, dict_of_ids def construct_ref_array(cit, dict_of_ids, len_): """returns an array with the number of references that each recid has """ ref = array((), int32) ref = zeros(len_, int32) for key in cit: for value in cit[key]: ref[dict_of_ids[value]] += 1 write_message("Number of references: %s" %str(ref), verbose=9) write_message("Finished computing total number \ of references for each paper.", verbose=5) return ref def get_external_links_from_file(filename, ref, dict_of_ids): """returns a dictionary containing the number of external links for each recid external link=citation that is not in our database """ ext_links = {} #format: ext_links[dict_of_ids[recid]]=number of total external links try: external_file = open(filename, "r") except StandardError: write_message("Cannot find file: %s" % filename, sys.stderr) raise StandardError for line in external_file: tokens = line.strip().split() recid = int(tokens[0]) nr_of_external = int(tokens[1]) ext_links[dict_of_ids[recid]] = nr_of_external - ref[dict_of_ids[recid]] if ext_links[dict_of_ids[recid]] < 0: ext_links[dict_of_ids[recid]] = 0 external_file.close() write_message("External link information extracted", verbose=2) return ext_links def get_external_links_from_db_old(ref, dict_of_ids, reference_indicator): """returns a dictionary containing the number of external links for each recid external link=citation that is not in our database """ ext_links = {} reference_tag_regex = reference_indicator + "[a-z]" for recid in dict_of_ids: query = "select COUNT(DISTINCT field_number) from bibrec_bib99x \ where id_bibrec='%s' and id_bibxxx in \ (select id from bib99x where tag RLIKE '%s');" \ % (str(recid), reference_tag_regex) result_set = run_sql(query) if result_set: total_links = int(result_set[0][0]) internal_links = ref[dict_of_ids[recid]] ext_links[dict_of_ids[recid]] = total_links - internal_links if ext_links[dict_of_ids[recid]] < 0: ext_links[dict_of_ids[recid]] = 0 else: ext_links[dict_of_ids[recid]] = 0 write_message("External link information extracted", verbose=2) write_message("External links: %s" % str(ext_links), verbose=9) return ext_links def get_external_links_from_db(ref, dict_of_ids, reference_indicator): """returns a dictionary containing the number of external links for each recid external link=citation that is not in our database """ ext_links = {} dict_all_ref = {} for recid in dict_of_ids: dict_all_ref[recid] = 0 ext_links[dict_of_ids[recid]] = 0 reference_db_id = reference_indicator[0:2] reference_tag_regex = reference_indicator + "[a-z]" tag_list = run_sql("select id from bib" + reference_db_id + \ "x where tag RLIKE %s", (reference_tag_regex, )) tag_set = set() for tag in tag_list: tag_set.add(tag[0]) ref_list = run_sql("select id_bibrec, id_bibxxx, field_number from \ bibrec_bib" + reference_db_id + "x group by \ id_bibrec, field_number") for item in ref_list: recid = int(item[0]) 
id_bib = int(item[1]) if recid in dict_of_ids and id_bib in tag_set: dict_all_ref[recid] += 1 for recid in dict_of_ids: total_links = dict_all_ref[recid] internal_links = ref[dict_of_ids[recid]] ext_links[dict_of_ids[recid]] = total_links - internal_links if ext_links[dict_of_ids[recid]] < 0: ext_links[dict_of_ids[recid]] = 0 write_message("External link information extracted", verbose=2) write_message("External links: %s" % str(ext_links), verbose=9) return ext_links def avg_ext_links_with_0(ext_links): """returns the average number of external links per paper including in the counting the papers with 0 external links""" total = 0.0 for item in ext_links: total += ext_links[item] avg_ext = total/len(ext_links) write_message("The average number of external links per paper (including \ papers with 0 external links) is: %s" % str(avg_ext), verbose=3) return avg_ext def avg_ext_links_without_0(ext_links): """returns the average number of external links per paper excluding in the counting the papers with 0 external links""" count = 0.0 total = 0.0 for item in ext_links: if ext_links[item] != 0: count += 1 total += ext_links[item] avg_ext = total/count write_message("The average number of external links per paper (excluding \ papers with 0 external links) is: %s" % str(avg_ext), verbose=3) return avg_ext def leaves(ref): """returns the number of papers that do not cite any other paper""" nr_of_leaves = 0 for i in ref: if i == 0: nr_of_leaves += 1 write_message("The number of papers that do not cite \ any other papers: %s" % str(leaves), verbose=3) return nr_of_leaves def get_dates_from_file(filename, dict_of_ids): """Returns the year of the publication for each paper. In case the year is not in the db, the year of the submission is taken""" dates = {} # the format is: dates[dict_of_ids[recid]] = year try: dates_file = open(filename, "r") except StandardError: write_message("Cannot find file: %s" % filename, sys.stderr) raise StandardError for line in dates_file: tokens = line.strip().split() recid = int(tokens[0]) year = int(tokens[1]) dates[dict_of_ids[recid]] = year dates_file.close() write_message("Dates extracted", verbose=2) write_message("Dates dictionary %s" % str(dates), verbose=9) return dates def get_dates_from_db(dict_of_ids, publication_year_tag, creation_date_tag): """Returns the year of the publication for each paper. 
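The result is keyed by the matrix index from dict_of_ids rather than by recid.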
In case the year is not in the db, the year of the submission is taken""" current_year = int(datetime.datetime.now().strftime("%Y")) publication_year_db_id = publication_year_tag[0:2] creation_date_db_id = creation_date_tag[0:2] total = 0 count = 0 dict_of_dates = {} for recid in dict_of_ids: dict_of_dates[recid] = 0 date_list = run_sql("select id, tag, value from bib" + \ publication_year_db_id + "x where tag=%s", \ (publication_year_tag, )) date_dict = {} for item in date_list: date_dict[int(item[0])] = item[2] pattern = re.compile('.*(\d{4}).*') date_list = run_sql("select id_bibrec, id_bibxxx, field_number \ from bibrec_bib" + publication_year_db_id +"x") for item in date_list: recid = int(item[0]) id_ = int(item[1]) if id_ in date_dict and recid in dict_of_dates: reg = pattern.match(date_dict[id_]) if reg: date = int(reg.group(1)) if date > 1000 and date <= current_year: dict_of_dates[recid] = date total += date count += 1 not_covered = [] for recid in dict_of_dates: if dict_of_dates[recid] == 0: not_covered.append(recid) date_list = run_sql("select id, tag, value from bib" + \ creation_date_db_id + "x where tag=%s", \ (creation_date_tag, )) date_dict = {} for item in date_list: date_dict[int(item[0])] = item[2] date_list = run_sql("select id_bibrec, id_bibxxx, field_number \ from bibrec_bib" + creation_date_db_id + "x") for item in date_list: recid = int(item[0]) id_ = int(item[1]) if id_ in date_dict and recid in not_covered: date = int(str(date_dict[id_])[0:4]) if date > 1000 and date <= current_year: dict_of_dates[recid] = date total += date count += 1 dates = {} med = total/count for recid in dict_of_dates: if dict_of_dates[recid] == 0: dates[dict_of_ids[recid]] = med else: dates[dict_of_ids[recid]] = dict_of_dates[recid] write_message("Dates extracted", verbose=2) write_message("Dates dictionary %s" % str(dates), verbose=9) return dates def construct_sparse_matrix(cit, ref, dict_of_ids, len_, damping_factor): """returns several structures needed in the calculation of the PAGERANK method using this structures, we don't need to keep the full matrix in the memory""" sparse = {} for item in cit: for value in cit[item]: sparse[(dict_of_ids[item], dict_of_ids[value])] = \ damping_factor * 1.0/ref[dict_of_ids[value]] semi_sparse = [] for j in range(len_): if ref[j] == 0: semi_sparse.append(j) semi_sparse_coeficient = damping_factor/len_ #zero_coeficient = (1-damping_factor)/len_ write_message("Sparse information calculated", verbose=3) return sparse, semi_sparse, semi_sparse_coeficient def construct_sparse_matrix_ext(cit, ref, ext_links, dict_of_ids, alpha, beta): """if x doesn't cite anyone: cites everyone : 1/len_ -- should be used! 
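# Illustrative sketch of the sparse representation built by
# construct_sparse_matrix() above: only the non-zero transition entries are
# kept in a dict keyed by (row, column); papers with no references ("dangling"
# nodes) are listed separately and their uniform contribution d/N is folded in
# during the iteration.  The small citation sample below is hypothetical.
d = 0.85                              # damping factor
cit = {1: {2, 3}, 2: {3}}
dict_of_ids = {1: 0, 2: 1, 3: 2}
ref = [0, 1, 2]                       # out-degree per dense index
n = len(dict_of_ids)

sparse = {}
for citee, citers in cit.items():
    for citer in citers:
        # mass flowing from the citing paper j to the cited paper i: d / outdeg(j)
        sparse[(dict_of_ids[citee], dict_of_ids[citer])] = d / ref[dict_of_ids[citer]]

dangling = [j for j in range(n) if ref[j] == 0]
dangling_coef = d / n
print(sparse)                   # {(0, 1): 0.85, (0, 2): 0.425, (1, 2): 0.425}
print(dangling, dangling_coef)  # [0] 0.2833...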
returns several structures needed in the calculation of the PAGERANK_EXT method""" len_ = len(dict_of_ids) sparse = {} semi_sparse = {} sparse[0, 0] = 1.0 - alpha for j in range(len_): sparse[j+1, 0] = alpha/(len_) if j not in ext_links: sparse[0, j+1] = beta/(len_ + beta) else: if ext_links[j] == 0: sparse[0, j+1] = beta/(len_ + beta) else: aux = beta * ext_links[j] if ref[j] == 0: sparse[0, j+1] = aux/(aux + len_) else: sparse[0, j+1] = aux/(aux + ref[j]) if ref[j] == 0: semi_sparse[j+1] = (1.0 - sparse[0, j + 1])/len_ for item in cit: for value in cit[item]: sparse[(dict_of_ids[item] + 1, dict_of_ids[value] + 1)] = \ (1.0 - sparse[0, dict_of_ids[value] + 1])/ref[dict_of_ids[value]] #for i in range(len_ + 1): # a = "" # for j in range (len_ + 1): # if (i,j) in sparse: # a += str(sparse[(i,j)]) + "\t" # else: # a += "0\t" # print a #print semi_sparse write_message("Sparse information calculated", verbose=3) return sparse, semi_sparse def construct_sparse_matrix_time(cit, ref, dict_of_ids, \ damping_factor, date_coef): """returns several structures needed in the calculation of the PAGERANK_time method using this structures, we don't need to keep the full matrix in the memory""" len_ = len(dict_of_ids) sparse = {} for item in cit: for value in cit[item]: sparse[(dict_of_ids[item], dict_of_ids[value])] = damping_factor * \ date_coef[dict_of_ids[value]]/ref[dict_of_ids[value]] semi_sparse = [] for j in range(len_): if ref[j] == 0: semi_sparse.append(j) semi_sparse_coeficient = damping_factor/len_ #zero_coeficient = (1-damping_factor)/len_ write_message("Sparse information calculated", verbose=3) return sparse, semi_sparse, semi_sparse_coeficient def statistics_on_sparse(sparse): """returns the number of papers that cite themselves""" count_diag = 0 for (i, j) in sparse.keys(): if i == j: count_diag += 1 write_message("The number of papers that cite themselves: %s" % \ str(count_diag), verbose=3) return count_diag def pagerank(conv_threshold, check_point, len_, sparse, \ semi_sparse, semi_sparse_coef): """the core function of the PAGERANK method returns an array with the ranks coresponding to each recid""" weights_old = ones((len_), float32) # initial weights weights_new = array((), float32) converged = False nr_of_check_points = 0 difference = len_ while not converged: nr_of_check_points += 1 for step in (range(check_point)): weights_new = zeros((len_), float32) for (i, j) in sparse.keys(): weights_new[i] += sparse[(i, j)]*weights_old[j] semi_total = 0.0 for j in semi_sparse: semi_total += weights_old[j] weights_new = weights_new + semi_sparse_coef * semi_total + \ (1.0/len_ - semi_sparse_coef) * sum(weights_old) if step == check_point - 1: diff = weights_new - weights_old difference = sqrt(dot(diff, diff))/len_ write_message("Finished step: %s, %s " \ %(str(check_point*(nr_of_check_points-1) + step), \ str(difference)), verbose=5) weights_old = weights_new.copy() converged = (difference < conv_threshold) write_message("PageRank calculated for all recids finnished in %s steps. 
\ The threshold was %s" % (str(nr_of_check_points), str(difference)),\ verbose=2) return weights_old def pagerank_ext(conv_threshold, check_point, len_, sparse, semi_sparse): """the core function of the PAGERANK_EXT method returns an array with the ranks coresponding to each recid""" weights_old = array((), float32) weights_old = ones((len_), float32) weights_new = array((), float32) converged = False nr_of_check_points = 0 difference = len_ while not converged: nr_of_check_points += 1 for step in (range(check_point)): weights_new = zeros((len_), float32) for (i, j) in sparse.keys(): weights_new[i] += sparse[(i, j)]*weights_old[j] total_sum = 0.0 for j in semi_sparse: total_sum += semi_sparse[j]*weights_old[j] weights_new[1:len_] = weights_new[1:len_] + total_sum if step == check_point - 1: diff = weights_new - weights_old difference = sqrt(dot(diff, diff))/len_ write_message("Finished step: %s, %s " \ % (str(check_point*(nr_of_check_points-1) + step), \ str(difference)), verbose=5) weights_old = weights_new.copy() converged = (difference < conv_threshold) write_message("PageRank calculated for all recids finnished in %s steps. \ The threshold was %s" % (str(nr_of_check_points), \ str(difference)), verbose=2) #return weights_old[1:len_]/(len_ - weights_old[0]) return weights_old[1:len_] def pagerank_time(conv_threshold, check_point, len_, \ sparse, semi_sparse, semi_sparse_coeficient, date_coef): """the core function of the PAGERANK_TIME method: pageRank + time decay returns an array with the ranks coresponding to each recid""" weights_old = array((), float32) weights_old = ones((len_), float32) # initial weights weights_new = array((), float32) converged = False nr_of_check_points = 0 difference = len_ while not converged: nr_of_check_points += 1 for step in (range(check_point)): weights_new = zeros((len_), float32) for (i, j) in sparse.keys(): weights_new[i] += sparse[(i, j)]*weights_old[j] semi_total = 0.0 for j in semi_sparse: semi_total += weights_old[j]*date_coef[j] zero_total = 0.0 for i in range(len_): zero_total += weights_old[i]*date_coef[i] #dates = array(date_coef.keys()) #zero_total = dot(weights_old, dates) weights_new = weights_new + semi_sparse_coeficient * semi_total + \ (1.0/len_ - semi_sparse_coeficient) * zero_total if step == check_point - 1: diff = weights_new - weights_old difference = sqrt(dot(diff, diff))/len_ write_message("Finished step: %s, %s " \ % (str(check_point*(nr_of_check_points-1) + step), \ str(difference)), verbose=5) weights_old = weights_new.copy() converged = (difference < conv_threshold) write_message("PageRank calculated for all recids finnished in %s steps.\ The threshold was %s" % (str(nr_of_check_points), \ str(difference)), verbose=2) return weights_old def citation_rank_time(cit, dict_of_ids, date_coef, dates, decimals): """returns a dictionary recid:weight based on the total number of citations as function of time""" dict_of_ranks = {} for key in dict_of_ids: if key in cit: dict_of_ranks[key] = 0 for recid in cit[key]: dict_of_ranks[key] += date_coef[dict_of_ids[recid]] dict_of_ranks[key] = round(dict_of_ranks[key], decimals) \ + dates[dict_of_ids[key]]* pow(10, 0-4-decimals) else: dict_of_ranks[key] = dates[dict_of_ids[key]]* pow(10, 0-4-decimals) write_message("Citation rank calculated", verbose=2) return dict_of_ranks def get_ranks(weights, dict_of_ids, mult, dates, decimals): """returns a dictionary recid:value, where value is the weight of the recid paper; the second order is the reverse time order, from recent to past""" dict_of_ranks 
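# Illustrative sketch of the power iteration performed by pagerank() above: the
# new weight of node i sums the sparse transition entries, every node receives
# the dangling-node mass (d/N per unit) plus the teleport term (1-d)/N, and the
# change is checked every 'check_point' steps via the L2 norm divided by N.
# numpy is assumed, as in the module; the default parameters are illustrative only.
import numpy as np

def pagerank_sketch(sparse, dangling, d, n, conv_threshold=1e-6, check_point=10):
    w_old = np.ones(n, dtype=np.float32)
    dangling_coef = d / n
    converged = False
    while not converged:
        for step in range(check_point):
            w_new = np.zeros(n, dtype=np.float32)
            for (i, j), value in sparse.items():
                w_new[i] += value * w_old[j]
            dangling_mass = sum(w_old[j] for j in dangling)
            w_new += dangling_coef * dangling_mass + (1.0 / n - dangling_coef) * w_old.sum()
            if step == check_point - 1:
                diff = w_new - w_old
                converged = np.sqrt(diff.dot(diff)) / n < conv_threshold
            w_old = w_new.copy()
    return w_old

# e.g. pagerank_sketch({(0, 1): 0.85, (0, 2): 0.425, (1, 2): 0.425}, [0], 0.85, 3)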
= {} for item in dict_of_ids: dict_of_ranks[item] = round(weights[dict_of_ids[item]]* mult, decimals)\ + dates[dict_of_ids[item]]* pow(10, 0-4-decimals) #dict_of_ranks[item] = weights[dict_of_ids[item]] return dict_of_ranks def sort_weights(dict_of_ranks): """sorts the recids based on weights(first order) and on dates(second order)""" ranks_by_citations = sorted(dict_of_ranks.keys(), lambda x, y: \ cmp(dict_of_ranks[y], dict_of_ranks[x])) return ranks_by_citations def normalize_weights(dict_of_ranks): """the weights should be normalized to 100, so they woun't be different from the weights from other ranking methods""" max_weight = 0.0 for recid in dict_of_ranks: weight = dict_of_ranks[recid] if weight > max_weight: max_weight = weight for recid in dict_of_ranks: dict_of_ranks[recid] = round(dict_of_ranks[recid] * 100.0/max_weight, 3) def write_first_ranks_to_file(ranks_by_citations, dict_of_ranks, \ nr_of_ranks, filename): """Writes the first n results of the ranking method into a file""" try: ranks_file = open(filename, "w") except StandardError: write_message("Problems with file: %s" % filename, sys.stderr) raise StandardError for i in range(nr_of_ranks): ranks_file.write(str(i+1) + "\t" + str(ranks_by_citations[i]) + \ "\t" + str(dict_of_ranks[ranks_by_citations[i]]) + "\n") ranks_file.close() write_message("The first %s pairs recid:rank in the ranking order \ are written into this file: %s" % (nr_of_ranks, filename), verbose=2) def del_rank_method_data(rank_method_code): """Delete the data for a rank method from rnkMETHODDATA table""" id_ = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) run_sql("DELETE FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id_[0][0], )) def into_db(dict_of_ranks, rank_method_code): """Writes into the rnkMETHODDATA table the ranking results""" method_id = run_sql("SELECT id from rnkMETHOD where name=%s", \ (rank_method_code, )) del_rank_method_data(rank_method_code) serialized_data = serialize_via_marshal(dict_of_ranks) method_id_str = str(method_id[0][0]) run_sql("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) \ - VALUES(%s, %s) ", (method_id_str, serialized_data, )) + VALUES(%s, _binary %s) ", (method_id_str, serialized_data, )) date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s", \ (date, rank_method_code)) write_message("Finished writing the ranks into rnkMETHOD table", verbose=5) def run_pagerank(cit, dict_of_ids, len_, ref, damping_factor, \ conv_threshold, check_point, dates): """returns the final form of the ranks when using pagerank method""" write_message("Running the PageRank method", verbose=5) sparse, semi_sparse, semi_sparse_coeficient = \ construct_sparse_matrix(cit, ref, dict_of_ids, len_, damping_factor) weights = pagerank(conv_threshold, check_point, len_, \ sparse, semi_sparse, semi_sparse_coeficient) dict_of_ranks = get_ranks(weights, dict_of_ids, 1, dates, 2) return dict_of_ranks def run_pagerank_ext(cit, dict_of_ids, ref, ext_links, \ conv_threshold, check_point, alpha, beta, dates): """returns the final form of the ranks when using pagerank_ext method""" write_message("Running the PageRank with external links method", verbose=5) len_ = len(dict_of_ids) sparse, semi_sparse = construct_sparse_matrix_ext(cit, ref, \ ext_links, dict_of_ids, alpha, beta) weights = pagerank_ext(conv_threshold, check_point, \ len_ + 1, sparse, semi_sparse) dict_of_ranks = get_ranks(weights, dict_of_ids, 1, dates, 2) return dict_of_ranks def run_pagerank_time(cit, 
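# Illustrative sketch of the rank post-processing above: the final value is the
# rounded weight plus a tiny year-based term that only breaks ties in favour of
# more recent papers; ranks are then ordered descending and scaled so the best
# paper gets 100 (normalize_weights).  Weights and years are hypothetical.
weights = {10: 0.52, 11: 0.52, 12: 0.13}   # recid -> raw weight
dates = {10: 1999, 11: 2015, 12: 2010}     # recid -> publication year
decimals, mult = 2, 1

ranks = dict((recid, round(w * mult, decimals) + dates[recid] * 10 ** (-4 - decimals))
             for recid, w in weights.items())

ordered = sorted(ranks, key=ranks.get, reverse=True)   # [11, 10, 12]: the year breaks the tie
best = max(ranks.values())
normalized = dict((recid, round(value * 100.0 / best, 3)) for recid, value in ranks.items())
print(ordered, normalized)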
dict_of_ids, len_, ref, damping_factor, \ conv_threshold, check_point, date_coef, dates): """returns the final form of the ranks when using pagerank + time decay method""" write_message("Running the PageRank_time method", verbose=5) sparse, semi_sparse, semi_sparse_coeficient = \ construct_sparse_matrix_time(cit, ref, dict_of_ids, \ damping_factor, date_coef) weights = pagerank_time(conv_threshold, check_point, len_, \ sparse, semi_sparse, semi_sparse_coeficient, date_coef) dict_of_ranks = get_ranks(weights, dict_of_ids, 100000, dates, 2) return dict_of_ranks def run_citation_rank_time(cit, dict_of_ids, date_coef, dates): """returns the final form of the ranks when using citation count as function of time method""" write_message("Running the citation rank with time decay method", verbose=5) dict_of_ranks = citation_rank_time(cit, dict_of_ids, date_coef, dates, 2) return dict_of_ranks def spearman_rank_correlation_coef(rank1, rank2, len_): """rank1 and rank2 are arrays containing the recids in the ranking order returns the corelation coeficient (-1 <= c <= 1) between 2 rankings the closec c is to 1, the more correlated are the two ranking methods""" total = 0 for i in range(len_): rank_value = rank2.index(rank1[i]) total += (i - rank_value)*(i - rank_value) return 1 - (6.0 * total) / (len_*(len_*len_ - 1)) def remove_loops(cit, dates, dict_of_ids): """when using time decay, new papers that are part of a loop are accumulating a lot of fake weight""" new_cit = {} for recid in cit: new_cit[recid] = [] for cited_by in cit[recid]: if dates[dict_of_ids[cited_by]] >= dates[dict_of_ids[recid]]: if cited_by in cit: if recid not in cit[cited_by]: new_cit[recid].append(cited_by) else: write_message("Loop removed: %s <-> %s" \ %(cited_by, recid), verbose=9) else: new_cit[recid].append(cited_by) else: write_message("Loop removed: %s <-> %s" \ %(cited_by, recid), verbose=9) write_message("Simple loops removed", verbose=5) return new_cit def calculate_time_weights(len_, time_decay, dates): """calculates the time coeficients for each paper""" current_year = int(datetime.datetime.now().strftime("%Y")) date_coef = {} for j in range(len_): date_coef[j] = exp(time_decay*(dates[j] - current_year)) write_message("Time weights calculated", verbose=5) write_message("Time weights: %s" % str(date_coef), verbose=9) return date_coef def get_dates(function, config, dict_of_ids): """returns a dictionary containing the year of publishing for each paper""" try: file_for_dates = config.get(function, "file_with_dates") dates = get_dates_from_file(file_for_dates, dict_of_ids) except (ConfigParser.NoOptionError, StandardError), err: write_message("If you want to read the dates from file set up the \ 'file_for_dates' variable in the config file [%s]" %err, verbose=3) try: publication_year_tag = config.get(function, "publication_year_tag") dummy = int(publication_year_tag[0:3]) except (ConfigParser.NoOptionError, StandardError): write_message("You need to set up correctly the publication_year_tag \ in the cfg file", sys.stderr) raise Exception try: creation_date_tag = config.get(function, "creation_date_tag") dummy = int(creation_date_tag[0:3]) except (ConfigParser.NoOptionError, StandardError): write_message("You need to set up correctly the creation_date_tag \ in the cfg file", sys.stderr) raise Exception dates = get_dates_from_db(dict_of_ids, publication_year_tag, \ creation_date_tag) return dates def citerank(rank_method_code): """new ranking method based on the citation graph""" write_message("Running rank method: %s" % 
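# Illustrative sketch of two helpers above.  spearman_rank_correlation_coef()
# computes rho = 1 - 6*sum(d_i^2) / (n*(n^2 - 1)) for two orderings of the same
# recids, and calculate_time_weights() assigns each paper exp(decay*(year - now))
# so older papers are damped exponentially.  The sample data is hypothetical.
import math

def spearman(rank1, rank2):
    n = len(rank1)
    total = sum((i - rank2.index(recid)) ** 2 for i, recid in enumerate(rank1))
    return 1 - (6.0 * total) / (n * (n * n - 1))

print(spearman([1, 2, 3, 4], [1, 2, 3, 4]))   #  1.0, identical rankings
print(spearman([1, 2, 3, 4], [4, 3, 2, 1]))   # -1.0, reversed rankings

current_year = 2016
time_decay = 0.5
dates = [2016, 2010, 2000]                    # publication year per dense index
date_coef = [math.exp(time_decay * (year - current_year)) for year in dates]
print(date_coef)                              # [1.0, ~0.0498, ~0.000335]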
rank_method_code, verbose=0) if not import_numpy: write_message('The numpy package could not be imported. \ This package is compulsory for running the citerank methods.') return try: file_ = CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg" config = ConfigParser.ConfigParser() config.readfp(open(file_)) except StandardError: write_message("Cannot find configuration file: %s" % file_, sys.stderr) raise StandardError # the file for citations needs to have the following format: #each line needs to be x[tab]y, where x cites y; x,y are recids function = config.get("rank_method", "function") try: file_for_citations = config.get(function, "file_with_citations") cit, dict_of_ids = get_citations_from_file(file_for_citations) except (ConfigParser.NoOptionError, StandardError), err: write_message("If you want to read the citation data from file set up \ the file_for_citations parameter in the config file [%s]" %err, verbose=2) cit, dict_of_ids = get_citations_from_db() len_ = len(dict_of_ids.keys()) write_message("Number of nodes(papers) to rank : %s" % str(len_), verbose=3) if len_ == 0: write_message("No citation data found, nothing to be done.") return try: method = config.get(function, "citerank_method") except ConfigParser.NoOptionError, err: write_message("Exception: %s " %err, sys.stderr) raise Exception write_message("Running %s method." % method, verbose=2) dates = get_dates(function, config, dict_of_ids) if method == "citation_time": try: time_decay = float(config.get(function, "time_decay")) except (ConfigParser.NoOptionError, ValueError), err: write_message("Exception: %s" % err, sys.stderr) raise Exception date_coef = calculate_time_weights(len_, time_decay, dates) #cit = remove_loops(cit, dates, dict_of_ids) dict_of_ranks = \ run_citation_rank_time(cit, dict_of_ids, date_coef, dates) else: try: conv_threshold = float(config.get(function, "conv_threshold")) check_point = int(config.get(function, "check_point")) damping_factor = float(config.get(function, "damping_factor")) write_message("Parameters: d = %s, conv_threshold = %s, \ check_point = %s" %(str(damping_factor), \ str(conv_threshold), str(check_point)), verbose=5) except (ConfigParser.NoOptionError, StandardError), err: write_message("Exception: %s" % err, sys.stderr) raise Exception if method == "pagerank_classic": ref = construct_ref_array(cit, dict_of_ids, len_) use_ext_cit = "" try: use_ext_cit = config.get(function, "use_external_citations") write_message("Pagerank will use external citations: %s" \ %str(use_ext_cit), verbose=5) except (ConfigParser.NoOptionError, StandardError), err: write_message("%s" % err, verbose=2) if use_ext_cit == "yes": try: ext_citation_file = config.get(function, "ext_citation_file") ext_links = get_external_links_from_file(ext_citation_file, ref, dict_of_ids) except (ConfigParser.NoOptionError, StandardError): write_message("If you want to read the external citation \ data from file set up the ext_citation_file parameter in the config. file", \ verbose=3) try: reference_tag = config.get(function, "ext_reference_tag") dummy = int(reference_tag[0:3]) except (ConfigParser.NoOptionError, StandardError): write_message("You need to set up correctly the \ reference_tag in the cfg file", sys.stderr) raise Exception ext_links = get_external_links_from_db(ref, \ dict_of_ids, reference_tag) avg = avg_ext_links_with_0(ext_links) if avg < 1: write_message("This method can't be ran. There is not \ enough information about the external citation. 
Hint: check the reference tag", \ sys.stderr) raise Exception avg_ext_links_without_0(ext_links) try: alpha = float(config.get(function, "ext_alpha")) beta = float(config.get(function, "ext_beta")) except (ConfigParser.NoOptionError, StandardError), err: write_message("Exception: %s" % err, sys.stderr) raise Exception dict_of_ranks = run_pagerank_ext(cit, dict_of_ids, ref, \ ext_links, conv_threshold, check_point, alpha, beta, dates) else: dict_of_ranks = run_pagerank(cit, dict_of_ids, len_, ref, \ damping_factor, conv_threshold, check_point, dates) elif method == "pagerank_time": try: time_decay = float(config.get(function, "time_decay")) write_message("Parameter: time_decay = %s" \ %str(time_decay), verbose=5) except (ConfigParser.NoOptionError, StandardError), err: write_message("Exception: %s" % err, sys.stderr) raise Exception date_coef = calculate_time_weights(len_, time_decay, dates) cit = remove_loops(cit, dates, dict_of_ids) ref = construct_ref_array(cit, dict_of_ids, len_) dict_of_ranks = run_pagerank_time(cit, dict_of_ids, len_, ref, \ damping_factor, conv_threshold, check_point, date_coef, dates) else: write_message("Error: Unknown ranking method. \ Please check the ranking_method parameter in the config. file.", sys.stderr) raise Exception try: filename_ranks = config.get(function, "output_ranks_to_filename") max_ranks = config.get(function, "output_rank_limit") if not max_ranks.isdigit(): max_ranks = len_ else: max_ranks = int(max_ranks) if max_ranks > len_: max_ranks = len_ ranks = sort_weights(dict_of_ranks) write_message("Ranks: %s" % str(ranks), verbose=9) write_first_ranks_to_file(ranks, dict_of_ranks, \ max_ranks, filename_ranks) except (ConfigParser.NoOptionError, StandardError): write_message("If you want the ranks to be printed in a file you have \ to set output_ranks_to_filename and output_rank_limit \ parameters in the configuration file", verbose=3) normalize_weights(dict_of_ranks) into_db(dict_of_ranks, rank_method_code) diff --git a/modules/bibrank/lib/bibrank_tag_based_indexer.py b/modules/bibrank/lib/bibrank_tag_based_indexer.py index 239795a45..1ef052bbe 100644 --- a/modules/bibrank/lib/bibrank_tag_based_indexer.py +++ b/modules/bibrank/lib/bibrank_tag_based_indexer.py @@ -1,504 +1,504 @@ # -*- coding: utf-8 -*- # Ranking of records using different parameters and methods. # This file is part of Invenio. -# Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2011, 2012 CERN. +# Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2011, 2012, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
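# Illustrative recap of the citerank() driver shown above: each rank method has
# a .cfg whose [rank_method] section names the function section, and that
# section's options select the algorithm (citation_time, pagerank_classic or
# pagerank_time) and its parameters.  The file content and section name below
# are hypothetical; the module itself reads the same option names with the
# Python 2 ConfigParser.
import configparser

CFG = """
[rank_method]
function = citerank_pagerank

[citerank_pagerank]
citerank_method = pagerank_classic
damping_factor = 0.85
conv_threshold = 0.0001
check_point = 10
"""

config = configparser.ConfigParser()
config.read_string(CFG)
function = config.get("rank_method", "function")
method = config.get(function, "citerank_method")
if method == "citation_time":
    pass   # citation counts weighted by exp(time_decay * (year - now))
else:      # pagerank_classic / pagerank_time
    damping_factor = float(config.get(function, "damping_factor"))
    conv_threshold = float(config.get(function, "conv_threshold"))
    check_point = int(config.get(function, "check_point"))
    print(method, damping_factor, conv_threshold, check_point)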
import sys import time import traceback import ConfigParser from invenio.config import \ CFG_SITE_LANG, \ CFG_ETCDIR from invenio.search_engine import perform_request_search from invenio.bibrank_citation_indexer import get_citation_weight, print_missing from invenio.bibrank_downloads_indexer import * from invenio.dbquery import run_sql, serialize_via_marshal, deserialize_via_marshal, \ wash_table_column_name, get_table_update_time from invenio.bibtask import task_get_option, write_message, task_sleep_now_if_required from invenio.bibindex_engine import create_range_list from invenio.intbitset import intbitset options = {} def download_weight_filtering_user_repair_exec (): """Repair download weight filtering user ranking method""" write_message("Repairing for this ranking method is not defined. Skipping.") return def download_weight_total_repair_exec(): """Repair download weight total ranking method""" write_message("Repairing for this ranking method is not defined. Skipping.") return def file_similarity_by_times_downloaded_repair_exec(): """Repair file similarity by times downloaded ranking method""" write_message("Repairing for this ranking method is not defined. Skipping.") return def single_tag_rank_method_repair_exec(): """Repair single tag ranking method""" write_message("Repairing for this ranking method is not defined. Skipping.") return def citation_exec(rank_method_code, name, config): """Rank method for citation analysis""" #first check if this is a specific task if task_get_option("cmd") == "print-missing": num = task_get_option("num") print_missing(num) else: dic, index_update_time = get_citation_weight(rank_method_code, config) if dic: if task_get_option("id") or task_get_option("collection") or \ task_get_option("modified"): # user have asked to citation-index specific records # only, so we should not update citation indexer's # last run time stamp information index_update_time = None intoDB(dic, index_update_time, rank_method_code) else: write_message("No need to update the indexes for citations.") def download_weight_filtering_user(run): return bibrank_engine(run) def download_weight_total(run): return bibrank_engine(run) def file_similarity_by_times_downloaded(run): return bibrank_engine(run) def download_weight_filtering_user_exec (rank_method_code, name, config): """Ranking by number of downloads per User. 
Only one full Text Download is taken in account for one specific userIP address""" begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time1 = time.time() dic = fromDB(rank_method_code) last_updated = get_lastupdated(rank_method_code) keys = new_downloads_to_index(last_updated) filter_downloads_per_hour(keys, last_updated) dic = get_download_weight_filtering_user(dic, keys) intoDB(dic, begin_date, rank_method_code) time2 = time.time() return {"time":time2-time1} def download_weight_total_exec(rank_method_code, name, config): """rankink by total number of downloads without check the user ip if users downloads 3 time the same full text document it has to be count as 3 downloads""" begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time1 = time.time() dic = fromDB(rank_method_code) last_updated = get_lastupdated(rank_method_code) keys = new_downloads_to_index(last_updated) filter_downloads_per_hour(keys, last_updated) dic = get_download_weight_total(dic, keys) intoDB(dic, begin_date, rank_method_code) time2 = time.time() return {"time":time2-time1} def file_similarity_by_times_downloaded_exec(rank_method_code, name, config): """update dictionnary {recid:[(recid, nb page similarity), ()..]}""" begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time1 = time.time() dic = fromDB(rank_method_code) last_updated = get_lastupdated(rank_method_code) keys = new_downloads_to_index(last_updated) filter_downloads_per_hour(keys, last_updated) dic = get_file_similarity_by_times_downloaded(dic, keys) intoDB(dic, begin_date, rank_method_code) time2 = time.time() return {"time":time2-time1} def single_tag_rank_method_exec(rank_method_code, name, config): """Creating the rank method data""" begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) rnkset = {} rnkset_old = fromDB(rank_method_code) rnkset_new = single_tag_rank(config) rnkset = union_dicts(rnkset_old, rnkset_new) intoDB(rnkset, begin_date, rank_method_code) def single_tag_rank(config): """Connect the given tag with the data from the kb file given""" write_message("Loading knowledgebase file", verbose=9) kb_data = {} records = [] write_message("Reading knowledgebase file: %s" % \ config.get(config.get("rank_method", "function"), "kb_src")) input = open(config.get(config.get("rank_method", "function"), "kb_src"), 'r') data = input.readlines() for line in data: if not line[0:1] == "#": kb_data[string.strip((string.split(string.strip(line), "---"))[0])] = (string.split(string.strip(line), "---"))[1] write_message("Number of lines read from knowledgebase file: %s" % len(kb_data)) tag = config.get(config.get("rank_method", "function"), "tag") tags = config.get(config.get("rank_method", "function"), "check_mandatory_tags").split(", ") if tags == ['']: tags = "" records = [] for (recids, recide) in options["recid_range"]: task_sleep_now_if_required(can_stop_too=True) write_message("......Processing records #%s-%s" % (recids, recide)) recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (tag, recids, recide)) valid = intbitset(trailing_bits=1) valid.discard(0) for key in tags: newset = intbitset() newset += [recid[0] for recid in (run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (key, recids, recide)))] valid.intersection_update(newset) if tags: recs = filter(lambda x: x[0] in valid, 
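# Illustrative sketch of single_tag_rank() above: a knowledge-base file maps
# tag values to scores, one "value---score" entry per line with '#' comments,
# and each record receives the score of its tag value (0 if the value is not in
# the knowledge base).  The file content and records are hypothetical.
KB = """\
# journal quality knowledge base
Phys. Rev. D---0.9
Nucl. Phys. B---0.8
"""

kb_data = {}
for line in KB.splitlines():
    if line and not line.startswith("#"):
        value, score = line.split("---", 1)
        kb_data[value.strip()] = score.strip()

records = [(77, "Phys. Rev. D"), (78, "Unknown Journal")]   # (recid, tag value)
rnkset = {}
for recid, value in records:
    rnkset[recid] = float(kb_data[value]) if value in kb_data else 0
print(rnkset)   # {77: 0.9, 78: 0}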
recs) records = records + list(recs) write_message("Number of records found with the necessary tags: %s" % len(records)) records = filter(lambda x: x[0] in options["validset"], records) rnkset = {} for key, value in records: if kb_data.has_key(value): if not rnkset.has_key(key): rnkset[key] = float(kb_data[value]) else: if kb_data.has_key(rnkset[key]) and float(kb_data[value]) > float((rnkset[key])[1]): rnkset[key] = float(kb_data[value]) else: rnkset[key] = 0 write_message("Number of records available in rank method: %s" % len(rnkset)) return rnkset def get_lastupdated(rank_method_code): """Get the last time the rank method was updated""" res = run_sql("SELECT rnkMETHOD.last_updated FROM rnkMETHOD WHERE name=%s", (rank_method_code, )) if res: return res[0][0] else: # raise Exception("Is this the first run? Please do a complete update.") return "1970-01-01 00:00:00" def intoDB(dic, date, rank_method_code): """Insert the rank method data into the database""" mid = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) del_rank_method_codeDATA(rank_method_code) serdata = serialize_via_marshal(dic) midstr = str(mid[0][0]) - run_sql("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) VALUES (%s,%s)", (midstr, serdata,)) + run_sql("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) VALUES (%s,_binary %s)", (midstr, serdata,)) if date: run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s", (date, rank_method_code)) def fromDB(rank_method_code): """Get the data for a rank method""" id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) if not id: return {} res = run_sql("SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], )) if res: return deserialize_via_marshal(res[0][0]) else: return {} def del_rank_method_codeDATA(rank_method_code): """Delete the data for a rank method""" id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) run_sql("DELETE FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], )) def del_recids(rank_method_code, range_rec): """Delete some records from the rank method""" id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) res = run_sql("SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], )) if res: rec_dict = deserialize_via_marshal(res[0][0]) write_message("Old size: %s" % len(rec_dict)) for (recids, recide) in range_rec: for i in range(int(recids), int(recide)): if rec_dict.has_key(i): del rec_dict[i] write_message("New size: %s" % len(rec_dict)) begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) intoDB(rec_dict, begin_date, rank_method_code) else: write_message("Create before deleting!") def union_dicts(dict1, dict2): "Returns union of the two dicts." 
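# Illustrative sketch of the substantive change in this hunk: the ranking data
# is a marshal-serialized Python dict, i.e. a binary blob, and the _binary
# character-set introducer in front of the placeholder keeps MySQL from trying
# to interpret the blob as text.  The table and columns are those used above;
# the connection handling is left out (the real code goes through run_sql()
# and serialize_via_marshal()/deserialize_via_marshal()).
import marshal

dic = {77: 0.9, 78: 0.2}
serdata = marshal.dumps(dic)            # binary blob to store

query = ("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) "
         "VALUES (%s, _binary %s)")     # _binary marks the second literal as binary
params = ("42", serdata)
# cursor.execute(query, params)         # with a MySQLdb-style cursor

print(marshal.loads(serdata) == dic)    # True: the blob round-trips byte-exactly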
union_dict = {} for (key, value) in dict1.iteritems(): union_dict[key] = value for (key, value) in dict2.iteritems(): union_dict[key] = value return union_dict def rank_method_code_statistics(rank_method_code): """Print statistics""" method = fromDB(rank_method_code) max = ('', -999999) maxcount = 0 min = ('', 999999) mincount = 0 for (recID, value) in method.iteritems(): if value < min and value > 0: min = value if value > max: max = value for (recID, value) in method.iteritems(): if value == min: mincount += 1 if value == max: maxcount += 1 write_message("Showing statistic for selected method") write_message("Method name: %s" % getName(rank_method_code)) write_message("Short name: %s" % rank_method_code) write_message("Last run: %s" % get_lastupdated(rank_method_code)) write_message("Number of records: %s" % len(method)) write_message("Lowest value: %s - Number of records: %s" % (min, mincount)) write_message("Highest value: %s - Number of records: %s" % (max, maxcount)) write_message("Divided into 10 sets:") for i in range(1, 11): setcount = 0 distinct_values = {} lower = -1.0 + ((float(max + 1) / 10)) * (i - 1) upper = -1.0 + ((float(max + 1) / 10)) * i for (recID, value) in method.iteritems(): if value >= lower and value <= upper: setcount += 1 distinct_values[value] = 1 write_message("Set %s (%s-%s) %s Distinct values: %s" % (i, lower, upper, len(distinct_values), setcount)) def check_method(rank_method_code): write_message("Checking rank method...") if len(fromDB(rank_method_code)) == 0: write_message("Rank method not yet executed, please run it to create the necessary data.") else: if len(add_recIDs_by_date(rank_method_code)) > 0: write_message("Records modified, update recommended") else: write_message("No records modified, update not necessary") def load_config(method): filename = CFG_ETCDIR + "/bibrank/" + method + ".cfg" config = ConfigParser.ConfigParser() try: config.readfp(open(filename)) except StandardError: write_message("Cannot find configuration file: %s" % filename, sys.stderr) raise return config def bibrank_engine(run): """Run the indexing task. Return 1 in case of success and 0 in case of failure. """ startCreate = time.time() options["run"] = [] options["run"].append(run) for rank_method_code in options["run"]: task_sleep_now_if_required(can_stop_too=True) cfg_name = getName(rank_method_code) write_message("Running rank method: %s." 
% cfg_name) config = load_config(rank_method_code) cfg_short = rank_method_code cfg_function = config.get("rank_method", "function") + "_exec" cfg_repair_function = config.get("rank_method", "function") + "_repair_exec" cfg_name = getName(cfg_short) options["validset"] = get_valid_range(rank_method_code) if task_get_option("collection"): l_of_colls = string.split(task_get_option("collection"), ", ") recIDs = perform_request_search(c=l_of_colls) recIDs_range = [] for recID in recIDs: recIDs_range.append([recID, recID]) options["recid_range"] = recIDs_range elif task_get_option("id"): options["recid_range"] = task_get_option("id") elif task_get_option("modified"): options["recid_range"] = add_recIDs_by_date(rank_method_code, task_get_option("modified")) elif task_get_option("last_updated"): options["recid_range"] = add_recIDs_by_date(rank_method_code) else: write_message("No records specified, updating all", verbose=2) min_id = run_sql("SELECT min(id) from bibrec")[0][0] max_id = run_sql("SELECT max(id) from bibrec")[0][0] options["recid_range"] = [[min_id, max_id]] if task_get_option("quick") == "no": write_message("Recalculate parameter not used, parameter ignored.", verbose=9) if task_get_option("cmd") == "del": del_recids(cfg_short, options["recid_range"]) elif task_get_option("cmd") == "add": func_object = globals().get(cfg_function) func_object(rank_method_code, cfg_name, config) elif task_get_option("cmd") == "stat": rank_method_code_statistics(rank_method_code) elif task_get_option("cmd") == "check": check_method(rank_method_code) elif task_get_option("cmd") == "print-missing": func_object = globals().get(cfg_function) func_object(rank_method_code, cfg_name, config) elif task_get_option("cmd") == "repair": func_object = globals().get(cfg_repair_function) func_object() else: write_message("Invalid command found processing %s" % rank_method_code, sys.stderr) raise StandardError if task_get_option("verbose"): showtime((time.time() - startCreate)) return 1 def get_valid_range(rank_method_code): """Return a range of records""" write_message("Getting records from collections enabled for rank method.", verbose=9) res = run_sql("SELECT collection.name FROM collection, collection_rnkMETHOD, rnkMETHOD WHERE collection.id=id_collection and id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name=%s", (rank_method_code, )) l_of_colls = [] for coll in res: l_of_colls.append(coll[0]) if len(l_of_colls) > 0: recIDs = perform_request_search(c=l_of_colls) else: recIDs = [] valid = intbitset() valid += recIDs return valid def add_recIDs_by_date(rank_method_code, dates=""): """Return recID range from records modified between DATES[0] and DATES[1]. If DATES is not set, then add records modified since the last run of the ranking method RANK_METHOD_CODE. 
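# Illustrative sketch of the dispatch convention used by bibrank_engine()
# above: the .cfg names a function, and the engine looks up "<function>_exec"
# (or "<function>_repair_exec" for --repair) among the module globals and calls
# it.  The stub below is a hypothetical stand-in for the real *_exec functions.
def citation_exec(rank_method_code, name, config):
    print("ranking %s" % rank_method_code)

def dispatch(cfg_function, rank_method_code, cfg_name, config):
    func_object = globals().get(cfg_function + "_exec")
    if func_object is None:
        raise Exception("unknown rank method function: %s" % cfg_function)
    return func_object(rank_method_code, cfg_name, config)

dispatch("citation", "cit", "citation", config=None)   # -> "ranking cit"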
""" if not dates: dates = (get_lastupdated(rank_method_code), '') if dates[0] is None: dates = ("0000-00-00 00:00:00", '') query = """SELECT b.id FROM bibrec AS b WHERE b.modification_date >= %s""" if dates[1]: query += " and b.modification_date <= %s" query += " ORDER BY b.id ASC""" if dates[1]: res = run_sql(query, (dates[0], dates[1])) else: res = run_sql(query, (dates[0], )) alist = create_range_list([row[0] for row in res]) if not alist: write_message("No new records added since last time method was run") return alist def getName(rank_method_code, ln=CFG_SITE_LANG, type='ln'): """Returns the name of the method if it exists""" try: rnkid = run_sql("SELECT id FROM rnkMETHOD where name=%s", (rank_method_code, )) if rnkid: rnkid = str(rnkid[0][0]) res = run_sql("SELECT value FROM rnkMETHODNAME where type=%s and ln=%s and id_rnkMETHOD=%s", (type, ln, rnkid)) if not res: res = run_sql("SELECT value FROM rnkMETHODNAME WHERE ln=%s and id_rnkMETHOD=%s and type=%s", (CFG_SITE_LANG, rnkid, type)) if not res: return rank_method_code return res[0][0] else: raise Exception except Exception: write_message("Cannot run rank method, either given code for method is wrong, or it has not been added using the webinterface.") raise Exception def single_tag_rank_method(run): return bibrank_engine(run) def showtime(timeused): """Show time used for method""" write_message("Time used: %d second(s)." % timeused, verbose=9) def citation(run): return bibrank_engine(run) # Hack to put index based sorting here, but this is very similar to tag #based method and should re-use a lot of this code, so better to have here #than separate # def index_term_count_exec(rank_method_code, name, config): """Creating the rank method data""" write_message("Recreating index weighting data") begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # we must recalculate these every time for all records, since the # weighting of a record is determined by the index entries of _other_ # records rnkset = calculate_index_term_count(config) intoDB(rnkset, begin_date, rank_method_code) def calculate_index_term_count(config): """Calculate the weight of a record set based on number of enries of a tag from the record in another index...useful for authority files""" records = [] if config.has_section("index_term_count"): index = config.get("index_term_count","index_table_name") tag = config.get("index_term_count","index_term_value_from_tag") # check against possible SQL injection: dummy = get_table_update_time(index) tag = wash_table_column_name(tag) else: raise Exception("Config file " + config + " does not have index_term_count section") return() task_sleep_now_if_required(can_stop_too=True) write_message("......Processing all records") query = "SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id" % \ (tag[0:2], tag[0:2]) # we checked that tag is safe records = list(run_sql(query, (tag,))) write_message("Number of records found with the necessary tags: %s" % len(records)) rnkset = {} for key, value in records: hits = 0 if len(value): query = "SELECT hitlist from %s where term = %%s" % index # we checked that index is a table row = run_sql(query, (value,)) if row and row[0] and row[0][0]: #has to be prepared for corrupted data! 
try: hits = len(intbitset(row[0][0])) except: hits = 0 rnkset[key] = hits write_message("Number of records available in rank method: %s" % len(rnkset)) return rnkset def index_term_count(run): return bibrank_engine(run) diff --git a/modules/bibrank/lib/bibrank_word_indexer.py b/modules/bibrank/lib/bibrank_word_indexer.py index df252476f..5ee0cfe5b 100644 --- a/modules/bibrank/lib/bibrank_word_indexer.py +++ b/modules/bibrank/lib/bibrank_word_indexer.py @@ -1,1184 +1,1184 @@ # This file is part of Invenio. -# Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2011, 2014 CERN. +# Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2011, 2014, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. __revision__ = "$Id$" import sys import time import urllib import math import re import ConfigParser from invenio.config import \ CFG_SITE_LANG, \ CFG_ETCDIR, \ CFG_SITE_URL from invenio.search_engine import perform_request_search, wash_index_term from invenio.dbquery import run_sql, DatabaseError, serialize_via_marshal, deserialize_via_marshal from invenio.bibindex_engine_stemmer import is_stemmer_available_for_language, stem from invenio.bibindex_engine_stopwords import is_stopword from invenio.bibindex_engine import beautify_range_list, \ kill_sleepy_mysql_threads, create_range_list from invenio.bibtask import write_message, task_get_option, task_update_progress, \ task_update_status, task_sleep_now_if_required from invenio.intbitset import intbitset from invenio.errorlib import register_exception from invenio.textutils import strip_accents options = {} # global variable to hold task options # safety parameters concerning DB thread-multiplication problem: CFG_CHECK_MYSQL_THREADS = 0 # to check or not to check the problem? CFG_MAX_MYSQL_THREADS = 50 # how many threads (connections) we consider as still safe CFG_MYSQL_THREAD_TIMEOUT = 20 # we'll kill threads that were sleeping for more than X seconds # override urllib's default password-asking behaviour: class MyFancyURLopener(urllib.FancyURLopener): def prompt_user_passwd(self, host, realm): # supply some dummy credentials by default return ("mysuperuser", "mysuperpass") def http_error_401(self, url, fp, errcode, errmsg, headers): # do not bother with protected pages raise IOError, (999, 'unauthorized access') return None #urllib._urlopener = MyFancyURLopener() nb_char_in_line = 50 # for verbose pretty printing chunksize = 1000 # default size of chunks that the records will be treated by base_process_size = 4500 # process base size # Dictionary merging functions def dict_union(list1, list2): "Returns union of the two dictionaries." 
union_dict = {} for (e, count) in list1.iteritems(): union_dict[e] = count for (e, count) in list2.iteritems(): if not union_dict.has_key(e): union_dict[e] = count else: union_dict[e] = (union_dict[e][0] + count[0], count[1]) #for (e, count) in list2.iteritems(): # list1[e] = (list1.get(e, (0, 0))[0] + count[0], count[1]) #return list1 return union_dict # tagToFunctions mapping. It offers an indirection level necesary for # indexing fulltext. The default is get_words_from_phrase tagToWordsFunctions = {} def get_words_from_phrase(phrase, weight, lang="", chars_punctuation=r"[\.\,\:\;\?\!\"]", chars_alphanumericseparators=r"[1234567890\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]", split=str.split): "Returns list of words from phrase 'phrase'." words = {} phrase = strip_accents(phrase) phrase = phrase.lower() #Getting rid of strange characters phrase = re.sub("é", 'e', phrase) phrase = re.sub("è", 'e', phrase) phrase = re.sub("à", 'a', phrase) phrase = re.sub(" ", ' ', phrase) phrase = re.sub("«", ' ', phrase) phrase = re.sub("»", ' ', phrase) phrase = re.sub("ê", ' ', phrase) phrase = re.sub("&", ' ', phrase) if phrase.find(" -1: #Most likely html, remove html code phrase = re.sub("(?s)<[^>]*>|&#?\w+;", ' ', phrase) #removes http links phrase = re.sub("(?s)http://[^( )]*", '', phrase) phrase = re.sub(chars_punctuation, ' ', phrase) #By doing this like below, characters standing alone, like c a b is not added to the inedx, but when they are together with characters like c++ or c$ they are added. for word in split(phrase): if options["remove_stopword"] == "True" and not is_stopword(word) and check_term(word, 0): if lang and lang !="none" and options["use_stemming"]: word = stem(word, lang) if not words.has_key(word): words[word] = (0, 0) else: if not words.has_key(word): words[word] = (0, 0) words[word] = (words[word][0] + weight, 0) elif options["remove_stopword"] == "True" and not is_stopword(word): phrase = re.sub(chars_alphanumericseparators, ' ', word) for word_ in split(phrase): if lang and lang !="none" and options["use_stemming"]: word_ = stem(word_, lang) if word_: if not words.has_key(word_): words[word_] = (0,0) words[word_] = (words[word_][0] + weight, 0) return words class WordTable: "A class to hold the words table." def __init__(self, tablename, fields_to_index, separators="[^\s]"): "Creates words table instance." self.tablename = tablename self.recIDs_in_mem = [] self.fields_to_index = fields_to_index self.separators = separators self.value = {} def get_field(self, recID, tag): """Returns list of values of the MARC-21 'tag' fields for the record 'recID'.""" out = [] bibXXx = "bib" + tag[0] + tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%s AND bb.id_bibxxx=b.id AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID, tag); res = run_sql(query) for row in res: out.append(row[0]) return out def clean(self): "Cleans the words table." self.value={} def put_into_db(self, mode="normal"): """Updates the current words table in the corresponding DB rnkWORD table. Mode 'normal' means normal execution, mode 'emergency' means words index reverting to old state. 
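# Illustrative sketch of the word-weight bookkeeping above: each field value is
# tokenized and every word accumulates a (weight, 0) tuple; dict_union() then
# merges the per-field dictionaries by adding the first component.  The
# tokenizer here is deliberately trimmed down (no stopwords, stemming, accent
# or HTML stripping, which the real get_words_from_phrase() handles).
import re

def words_from_phrase(phrase, weight):
    words = {}
    for word in re.sub(r"[\.\,\:\;\?\!\"]", " ", phrase.lower()).split():
        count = words.get(word, (0, 0))
        words[word] = (count[0] + weight, 0)
    return words

def dict_union_sketch(d1, d2):
    union = dict(d1)
    for word, count in d2.items():
        if word in union:
            union[word] = (union[word][0] + count[0], count[1])
        else:
            union[word] = count
    return union

title = words_from_phrase("Neutrino oscillations, revisited", 10)
abstract = words_from_phrase("neutrino mass and oscillations", 1)
print(dict_union_sketch(title, abstract))
# {'neutrino': (11, 0), 'oscillations': (11, 0), 'revisited': (10, 0), 'mass': (1, 0), 'and': (1, 0)}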
""" write_message("%s %s wordtable flush started" % (self.tablename,mode)) write_message('...updating %d words into %sR started' % \ (len(self.value), self.tablename[:-1])) task_update_progress("%s flushed %d/%d words" % (self.tablename, 0, len(self.value))) self.recIDs_in_mem = beautify_range_list(self.recIDs_in_mem) if mode == "normal": for group in self.recIDs_in_mem: query = """UPDATE %sR SET type='TEMPORARY' WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='CURRENT'""" % \ (self.tablename[:-1], group[0], group[1]) write_message(query, verbose=9) run_sql(query) nb_words_total = len(self.value) nb_words_report = int(nb_words_total/10) nb_words_done = 0 for word in self.value.keys(): self.put_word_into_db(word, self.value[word]) nb_words_done += 1 if nb_words_report!=0 and ((nb_words_done % nb_words_report) == 0): write_message('......processed %d/%d words' % (nb_words_done, nb_words_total)) task_update_progress("%s flushed %d/%d words" % (self.tablename, nb_words_done, nb_words_total)) write_message('...updating %d words into %s ended' % \ (nb_words_total, self.tablename), verbose=9) #if options["verbose"]: # write_message('...updating reverse table %sR started' % self.tablename[:-1]) if mode == "normal": for group in self.recIDs_in_mem: query = """UPDATE %sR SET type='CURRENT' WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \ (self.tablename[:-1], group[0], group[1]) write_message(query, verbose=9) run_sql(query) query = """DELETE FROM %sR WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \ (self.tablename[:-1], group[0], group[1]) write_message(query, verbose=9) run_sql(query) write_message('End of updating wordTable into %s' % self.tablename, verbose=9) elif mode == "emergency": write_message("emergency") for group in self.recIDs_in_mem: query = """UPDATE %sR SET type='CURRENT' WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \ (self.tablename[:-1], group[0], group[1]) write_message(query, verbose=9) run_sql(query) query = """DELETE FROM %sR WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \ (self.tablename[:-1], group[0], group[1]) write_message(query, verbose=9) run_sql(query) write_message('End of emergency flushing wordTable into %s' % self.tablename, verbose=9) #if options["verbose"]: # write_message('...updating reverse table %sR ended' % self.tablename[:-1]) self.clean() self.recIDs_in_mem = [] write_message("%s %s wordtable flush ended" % (self.tablename, mode)) task_update_progress("%s flush ended" % (self.tablename)) def load_old_recIDs(self,word): """Load existing hitlist for the word from the database index files.""" query = "SELECT hitlist FROM %s WHERE term=%%s" % self.tablename res = run_sql(query, (word,)) if res: return deserialize_via_marshal(res[0][0]) else: return None def merge_with_old_recIDs(self,word,recIDs, set): """Merge the system numbers stored in memory (hash of recIDs with value[0] > 0 or -1 according to whether to add/delete them) with those stored in the database index and received in set universe of recIDs for the given word. Return 0 in case no change was done to SET, return 1 in case SET was changed. 
""" set_changed_p = 0 for recID,sign in recIDs.iteritems(): if sign[0] == -1 and set.has_key(recID): # delete recID if existent in set and if marked as to be deleted del set[recID] set_changed_p = 1 elif sign[0] > -1 and not set.has_key(recID): # add recID if not existent in set and if marked as to be added set[recID] = sign set_changed_p = 1 elif sign[0] > -1 and sign[0] != set[recID][0]: set[recID] = sign set_changed_p = 1 return set_changed_p def put_word_into_db(self, word, recIDs, split=str.split): """Flush a single word to the database and delete it from memory""" set = self.load_old_recIDs(word) #write_message("%s %s" % (word, self.value[word])) if set is not None: # merge the word recIDs found in memory: options["modified_words"][word] = 1 if not self.merge_with_old_recIDs(word, recIDs, set): # nothing to update: write_message("......... unchanged hitlist for ``%s''" % word, verbose=9) pass else: # yes there were some new words: write_message("......... updating hitlist for ``%s''" % word, verbose=9) - run_sql("UPDATE %s SET hitlist=%%s WHERE term=%%s" % self.tablename, + run_sql("UPDATE %s SET hitlist=_binary %%s WHERE term=%%s" % self.tablename, (serialize_via_marshal(set), word)) else: # the word is new, will create new set: write_message("......... inserting hitlist for ``%s''" % word, verbose=9) set = self.value[word] if len(set) > 0: #new word, add to list options["modified_words"][word] = 1 try: - run_sql("INSERT INTO %s (term, hitlist) VALUES (%%s, %%s)" % self.tablename, + run_sql("INSERT INTO %s (term, hitlist) VALUES (%%s, _binary %%s)" % self.tablename, (word, serialize_via_marshal(set))) except Exception, e: ## FIXME: This is for debugging encoding errors register_exception(prefix="Error when putting the term '%s' into db (hitlist=%s): %s\n" % (repr(word), set, e), alert_admin=True) if not set: # never store empty words run_sql("DELETE from %s WHERE term=%%s" % self.tablename, (word,)) del self.value[word] def display(self): "Displays the word table." keys = self.value.keys() keys.sort() for k in keys: write_message("%s: %s" % (k, self.value[k])) def count(self): "Returns the number of words in the table." return len(self.value) def info(self): "Prints some information on the words table." write_message("The words table contains %d words." % self.count()) def lookup_words(self, word=""): "Lookup word from the words table." if not word: done = 0 while not done: try: word = raw_input("Enter word: ") done = 1 except (EOFError, KeyboardInterrupt): return if self.value.has_key(word): write_message("The word '%s' is found %d times." \ % (word, len(self.value[word]))) else: write_message("The word '%s' does not exist in the word file."\ % word) def update_last_updated(self, rank_method_code, starting_time=None): """Update last_updated column of the index table in the database. Puts starting time there so that if the task was interrupted for record download, the records will be reindexed next time.""" if starting_time is None: return None write_message("updating last_updated to %s..." % starting_time, verbose=9) return run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s", (starting_time, rank_method_code,)) def add_recIDs(self, recIDs): """Fetches records which id in the recIDs arange list and adds them to the wordTable. The recIDs arange list is of the form: [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]]. 
""" global chunksize flush_count = 0 records_done = 0 records_to_go = 0 for arange in recIDs: records_to_go = records_to_go + arange[1] - arange[0] + 1 time_started = time.time() # will measure profile time for arange in recIDs: i_low = arange[0] chunksize_count = 0 while i_low <= arange[1]: # calculate chunk group of recIDs and treat it: i_high = min(i_low+task_get_option("flush")-flush_count-1,arange[1]) i_high = min(i_low+chunksize-chunksize_count-1, i_high) try: self.chk_recID_range(i_low, i_high) except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) register_exception() task_update_status("ERROR") sys.exit(1) write_message("%s adding records #%d-#%d started" % \ (self.tablename, i_low, i_high)) if CFG_CHECK_MYSQL_THREADS: kill_sleepy_mysql_threads() task_update_progress("%s adding recs %d-%d" % (self.tablename, i_low, i_high)) self.del_recID_range(i_low, i_high) just_processed = self.add_recID_range(i_low, i_high) flush_count = flush_count + i_high - i_low + 1 chunksize_count = chunksize_count + i_high - i_low + 1 records_done = records_done + just_processed write_message("%s adding records #%d-#%d ended " % \ (self.tablename, i_low, i_high)) if chunksize_count >= chunksize: chunksize_count = 0 # flush if necessary: if flush_count >= task_get_option("flush"): self.put_into_db() self.clean() write_message("%s backing up" % (self.tablename)) flush_count = 0 self.log_progress(time_started,records_done,records_to_go) # iterate: i_low = i_high + 1 if flush_count > 0: self.put_into_db() self.log_progress(time_started,records_done,records_to_go) def add_recIDs_by_date(self, dates=""): """Add recIDs modified between DATES[0] and DATES[1]. If DATES is not set, then add records modified since the last run of the ranking method. """ if not dates: write_message("Using the last update time for the rank method") query = """SELECT last_updated FROM rnkMETHOD WHERE name='%s' """ % options["current_run"] res = run_sql(query) if not res: return if not res[0][0]: dates = ("0000-00-00",'') else: dates = (res[0][0],'') query = """SELECT b.id FROM bibrec AS b WHERE b.modification_date >= '%s'""" % dates[0] if dates[1]: query += "and b.modification_date <= '%s'" % dates[1] query += " ORDER BY b.id ASC""" res = run_sql(query) alist = create_range_list([row[0] for row in res]) if not alist: write_message( "No new records added. %s is up to date" % self.tablename) else: self.add_recIDs(alist) return alist def add_recID_range(self, recID1, recID2): """Add records from RECID1 to RECID2.""" wlist = {} normalize = {} self.recIDs_in_mem.append([recID1,recID2]) # secondly fetch all needed tags: for (tag, weight, lang) in self.fields_to_index: if tag in tagToWordsFunctions.keys(): get_words_function = tagToWordsFunctions[tag] else: get_words_function = get_words_from_phrase bibXXx = "bib" + tag[0] + tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT bb.id_bibrec,b.value FROM %s AS b, %s AS bb WHERE bb.id_bibrec BETWEEN %d AND %d AND bb.id_bibxxx=b.id AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID1, recID2, tag) res = run_sql(query) nb_total_to_read = len(res) verbose_idx = 0 # for verbose pretty printing for row in res: recID, phrase = row if recID in options["validset"]: if not wlist.has_key(recID): wlist[recID] = {} new_words = get_words_function(phrase, weight, lang) # ,self.separators wlist[recID] = dict_union(new_words,wlist[recID]) # were there some words for these recIDs found? 
if len(wlist) == 0: return 0 recIDs = wlist.keys() for recID in recIDs: # was this record marked as deleted? if "DELETED" in self.get_field(recID, "980__c"): wlist[recID] = {} write_message("... record %d was declared deleted, removing its word list" % recID, verbose=9) write_message("... record %d, termlist: %s" % (recID, wlist[recID]), verbose=9) # put words into reverse index table with FUTURE status: for recID in recIDs: - run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,%%s,'FUTURE')" % self.tablename[:-1], + run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'FUTURE')" % self.tablename[:-1], (recID, serialize_via_marshal(wlist[recID]))) # ... and, for new records, enter the CURRENT status as empty: try: - run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,%%s,'CURRENT')" % self.tablename[:-1], + run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'CURRENT')" % self.tablename[:-1], (recID, serialize_via_marshal([]))) except DatabaseError: # okay, it's an already existing record, no problem pass # put words into memory word list: put = self.put for recID in recIDs: for (w, count) in wlist[recID].iteritems(): put(recID, w, count) return len(recIDs) def log_progress(self, start, done, todo): """Calculate progress and store it. start: start time, done: records processed, todo: total number of records""" time_elapsed = time.time() - start # consistency check if time_elapsed == 0 or done > todo: return time_recs_per_min = done/(time_elapsed/60.0) write_message("%d records took %.1f seconds to complete.(%1.f recs/min)"\ % (done, time_elapsed, time_recs_per_min)) if time_recs_per_min: write_message("Estimated runtime: %.1f minutes" % \ ((todo-done)/time_recs_per_min)) def put(self, recID, word, sign): "Adds/deletes a word to the word list." try: word = wash_index_term(word) if self.value.has_key(word): # the word 'word' exist already: update sign self.value[word][recID] = sign # PROBLEM ? else: self.value[word] = {recID: sign} except: write_message("Error: Cannot put word %s with sign %d for recID %s." % (word, sign, recID)) def del_recIDs(self, recIDs): """Fetches records which id in the recIDs range list and adds them to the wordTable. The recIDs range list is of the form: [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]]. """ count = 0 for range in recIDs: self.del_recID_range(range[0],range[1]) count = count + range[1] - range[0] self.put_into_db() def del_recID_range(self, low, high): """Deletes records with 'recID' system number between low and high from memory words index table.""" write_message("%s fetching existing words for records #%d-#%d started" % \ (self.tablename, low, high), verbose=3) self.recIDs_in_mem.append([low,high]) query = """SELECT id_bibrec,termlist FROM %sR as bb WHERE bb.id_bibrec BETWEEN '%d' AND '%d'""" % (self.tablename[:-1], low, high) recID_rows = run_sql(query) for recID_row in recID_rows: recID = recID_row[0] wlist = deserialize_via_marshal(recID_row[1]) for word in wlist: self.put(recID, word, (-1, 0)) write_message("%s fetching existing words for records #%d-#%d ended" % \ (self.tablename, low, high), verbose=3) def check_bad_words(self): """ Finds bad words in reverse tables. Returns the number of bad words. """ query = """SELECT count(1) FROM %sR WHERE type IN ('TEMPORARY','FUTURE')""" % (self.tablename[:-1]) res = run_sql(query) return res[0][0] def report_on_table_consistency(self): """Check reverse words index tables (e.g. 
rnkWORD01R) for interesting states such as 'TEMPORARY' state. Prints small report (no of words, no of bad words). """ # find number of words: query = """SELECT COUNT(*) FROM %s""" % (self.tablename) res = run_sql(query, None, 1) if res: nb_words = res[0][0] else: nb_words = 0 # report stats: write_message("%s contains %d words" % (self.tablename, nb_words)) # find possible bad states in reverse tables: nb_bad_words = self.check_bad_words() if nb_bad_words: write_message("EMERGENCY: %s needs to repair %d of %d index records" % (self.tablename, nb_bad_words, nb_words)) else: write_message("%s is in consistent state" % (self.tablename)) def repair(self): """Repair the whole table""" # find possible bad states in reverse tables: if self.check_bad_words() == 0: return query = """SELECT id_bibrec FROM %sR WHERE type in ('TEMPORARY','FUTURE')""" \ % (self.tablename[:-1]) res = intbitset(run_sql(query)) recIDs = create_range_list(list(res)) flush_count = 0 records_done = 0 records_to_go = 0 for range in recIDs: records_to_go = records_to_go + range[1] - range[0] + 1 time_started = time.time() # will measure profile time for range in recIDs: i_low = range[0] chunksize_count = 0 while i_low <= range[1]: # calculate chunk group of recIDs and treat it: i_high = min(i_low+task_get_option("flush")-flush_count-1,range[1]) i_high = min(i_low+chunksize-chunksize_count-1, i_high) try: self.fix_recID_range(i_low, i_high) except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) register_exception() task_update_status("ERROR") sys.exit(1) flush_count = flush_count + i_high - i_low + 1 chunksize_count = chunksize_count + i_high - i_low + 1 records_done = records_done + i_high - i_low + 1 if chunksize_count >= chunksize: chunksize_count = 0 # flush if necessary: if flush_count >= task_get_option("flush"): self.put_into_db("emergency") self.clean() flush_count = 0 self.log_progress(time_started,records_done,records_to_go) # iterate: i_low = i_high + 1 if flush_count > 0: self.put_into_db("emergency") self.log_progress(time_started,records_done,records_to_go) write_message("%s inconsistencies repaired." % self.tablename) def chk_recID_range(self, low, high): """Check if the reverse index table is in proper state""" ## check db query = """SELECT COUNT(*) FROM %sR WHERE type <> 'CURRENT' AND id_bibrec BETWEEN '%d' AND '%d'""" % (self.tablename[:-1], low, high) res = run_sql(query, None, 1) if res[0][0]==0: write_message("%s for %d-%d is in consistent state"%(self.tablename,low,high)) return # okay, words table is consistent ## inconsistency detected! write_message("EMERGENCY: %s inconsistencies detected..." % self.tablename) write_message("""EMERGENCY: Errors found. You should check consistency of the %s - %sR tables.\nRunning 'bibrank --repair' is recommended.""" \ % (self.tablename, self.tablename[:-1])) raise StandardError def fix_recID_range(self, low, high): """Try to fix reverse index database consistency (e.g. table rnkWORD01R) in the low,high doc-id range. Possible states for a recID follow: CUR TMP FUT: very bad things have happened: warn! CUR TMP : very bad things have happened: warn! CUR FUT: delete FUT (crash before flushing) CUR : database is ok TMP FUT: add TMP to memory and del FUT from memory flush (revert to old state) TMP : very bad things have happened: warn! FUT: very bad things have happended: warn! 
""" state = {} query = "SELECT id_bibrec,type FROM %sR WHERE id_bibrec BETWEEN '%d' AND '%d'"\ % (self.tablename[:-1], low, high) res = run_sql(query) for row in res: if not state.has_key(row[0]): state[row[0]]=[] state[row[0]].append(row[1]) ok = 1 # will hold info on whether we will be able to repair for recID in state.keys(): if not 'TEMPORARY' in state[recID]: if 'FUTURE' in state[recID]: if 'CURRENT' not in state[recID]: write_message("EMERGENCY: Index record %d is in inconsistent state. Can't repair it" % recID) ok = 0 else: write_message("EMERGENCY: Inconsistency in index record %d detected" % recID) query = """DELETE FROM %sR WHERE id_bibrec='%d'""" % (self.tablename[:-1], recID) run_sql(query) write_message("EMERGENCY: Inconsistency in index record %d repaired." % recID) else: if 'FUTURE' in state[recID] and not 'CURRENT' in state[recID]: self.recIDs_in_mem.append([recID,recID]) # Get the words file query = """SELECT type,termlist FROM %sR WHERE id_bibrec='%d'""" % (self.tablename[:-1], recID) write_message(query, verbose=9) res = run_sql(query) for row in res: wlist = deserialize_via_marshal(row[1]) write_message("Words are %s " % wlist, verbose=9) if row[0] == 'TEMPORARY': sign = 1 else: sign = -1 for word in wlist: self.put(recID, word, wlist[word]) else: write_message("EMERGENCY: %s for %d is in inconsistent state. Couldn't repair it." % (self.tablename, recID)) ok = 0 if not ok: write_message("""EMERGENCY: Unrepairable errors found. You should check consistency of the %s - %sR tables. Deleting affected TEMPORARY and FUTURE entries from these tables is recommended; see the BibIndex Admin Guide. (The repairing procedure is similar for bibrank word indexes.)""" % (self.tablename, self.tablename[:-1])) raise StandardError def word_index(run): """Run the indexing task. The row argument is the BibSched task queue row, containing if, arguments, etc. Return 1 in case of success and 0 in case of failure. 
""" global languages max_recid = 0 res = run_sql("SELECT max(id) FROM bibrec") if res and res[0][0]: max_recid = int(res[0][0]) options["run"] = [] options["run"].append(run) for rank_method_code in options["run"]: task_sleep_now_if_required(can_stop_too=True) method_starting_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) write_message("Running rank method: %s" % getName(rank_method_code)) try: file = CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg" config = ConfigParser.ConfigParser() config.readfp(open(file)) except StandardError, e: write_message("Cannot find configurationfile: %s" % file, sys.stderr) raise StandardError options["current_run"] = rank_method_code options["modified_words"] = {} options["table"] = config.get(config.get("rank_method", "function"), "table") options["use_stemming"] = config.get(config.get("rank_method","function"),"stemming") options["remove_stopword"] = config.get(config.get("rank_method","function"),"stopword") tags = get_tags(config) #get the tags to include options["validset"] = get_valid_range(rank_method_code) #get the records from the collections the method is enabled for function = config.get("rank_method","function") wordTable = WordTable(options["table"], tags) wordTable.report_on_table_consistency() try: if task_get_option("cmd") == "del": if task_get_option("id"): wordTable.del_recIDs(task_get_option("id")) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("collection"): l_of_colls = task_get_option("collection").split(",") recIDs = perform_request_search(c=l_of_colls) recIDs_range = [] for recID in recIDs: recIDs_range.append([recID,recID]) wordTable.del_recIDs(recIDs_range) task_sleep_now_if_required(can_stop_too=True) else: write_message("Missing IDs of records to delete from index %s.", wordTable.tablename, sys.stderr) raise StandardError elif task_get_option("cmd") == "add": if task_get_option("id"): wordTable.add_recIDs(task_get_option("id")) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("collection"): l_of_colls = task_get_option("collection").split(",") recIDs = perform_request_search(c=l_of_colls) recIDs_range = [] for recID in recIDs: recIDs_range.append([recID,recID]) wordTable.add_recIDs(recIDs_range) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("last_updated"): wordTable.add_recIDs_by_date("") # only update last_updated if run via automatic mode: wordTable.update_last_updated(rank_method_code, method_starting_time) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("modified"): wordTable.add_recIDs_by_date(task_get_option("modified")) task_sleep_now_if_required(can_stop_too=True) else: wordTable.add_recIDs([[0,max_recid]]) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("cmd") == "repair": wordTable.repair() check_rnkWORD(options["table"]) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("cmd") == "check": check_rnkWORD(options["table"]) options["modified_words"] = {} task_sleep_now_if_required(can_stop_too=True) elif task_get_option("cmd") == "stat": rank_method_code_statistics(options["table"]) task_sleep_now_if_required(can_stop_too=True) else: write_message("Invalid command found processing %s" % \ wordTable.tablename, sys.stderr) raise StandardError update_rnkWORD(options["table"], options["modified_words"]) task_sleep_now_if_required(can_stop_too=True) except StandardError, e: register_exception(alert_admin=True) write_message("Exception caught: %s" % e, sys.stderr) sys.exit(1) 
wordTable.report_on_table_consistency() # We are done. State it in the database, close and quit return 1 def get_tags(config): """Get the tags that should be used creating the index and each tag's parameter""" tags = [] function = config.get("rank_method","function") i = 1 shown_error = 0 #try: if 1: while config.has_option(function,"tag%s"% i): tag = config.get(function, "tag%s" % i) tag = tag.split(",") tag[1] = int(tag[1].strip()) tag[2] = tag[2].strip() #check if stemmer for language is available if config.get(function, "stemming") and stem("information", "en") != "inform": if shown_error == 0: write_message("Warning: Stemming not working. Please check it out!") shown_error = 1 elif tag[2] and tag[2] != "none" and config.get(function,"stemming") and not is_stemmer_available_for_language(tag[2]): write_message("Warning: Stemming not available for language '%s'." % tag[2]) tags.append(tag) i += 1 #except Exception: # write_message("Could not read data from configuration file, please check for errors") # raise StandardError return tags def get_valid_range(rank_method_code): """Returns which records are valid for this rank method, according to which collections it is enabled for.""" #if options["verbose"] >=9: # write_message("Getting records from collections enabled for rank method.") #res = run_sql("SELECT collection.name FROM collection,collection_rnkMETHOD,rnkMETHOD WHERE collection.id=id_collection and id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name='%s'" % rank_method_code) #l_of_colls = [] #for coll in res: # l_of_colls.append(coll[0]) #if len(l_of_colls) > 0: # recIDs = perform_request_search(c=l_of_colls) #else: # recIDs = [] valid = intbitset(trailing_bits=1) valid.discard(0) #valid.addlist(recIDs) return valid def check_term(term, termlength): """Check if term contains not allowed characters, or for any other reasons for not using this term.""" try: if len(term) <= termlength: return False reg = re.compile(r"[1234567890\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]") if re.search(reg, term): return False term = str.replace(term, "-", "") term = str.replace(term, ".", "") term = str.replace(term, ",", "") if int(term): return False except StandardError, e: pass return True def check_rnkWORD(table): """Checks for any problems in rnkWORD tables.""" i = 0 errors = {} termslist = run_sql("SELECT term FROM %s" % table) N = run_sql("select max(id_bibrec) from %sR" % table[:-1])[0][0] write_message("Checking integrity of rank values in %s" % table) terms = map(lambda x: x[0], termslist) while i < len(terms): query_params = () for j in range(i, ((i+5000)< len(terms) and (i+5000) or len(terms))): query_params += (terms[j],) terms_docs = run_sql("SELECT term, hitlist FROM %s WHERE term IN (%s)" % (table, (len(query_params)*"%s,")[:-1]), query_params) for (t, hitlist) in terms_docs: term_docs = deserialize_via_marshal(hitlist) if (term_docs.has_key("Gi") and term_docs["Gi"][1] == 0) or not term_docs.has_key("Gi"): write_message("ERROR: Missing value for term: %s (%s) in %s: %s" % (t, repr(t), table, len(term_docs))) errors[t] = 1 i += 5000 write_message("Checking integrity of rank values in %sR" % table[:-1]) i = 0 while i < N: docs_terms = run_sql("SELECT id_bibrec, termlist FROM %sR WHERE id_bibrec>=%s and id_bibrec<=%s" % (table[:-1], i, i+5000)) for (j, termlist) in docs_terms: termlist = deserialize_via_marshal(termlist) for (t, tf) in termlist.iteritems(): if tf[1] == 0 and not errors.has_key(t): errors[t] = 1 write_message("ERROR: Gi missing for record %s and term: %s 
(%s) in %s" % (j,t,repr(t), table)) terms_docs = run_sql("SELECT term, hitlist FROM %s WHERE term=%%s" % table, (t,)) termlist = deserialize_via_marshal(terms_docs[0][1]) i += 5000 if len(errors) == 0: write_message("No direct errors found, but nonconsistent data may exist.") else: write_message("%s errors found during integrity check, repair and rebalancing recommended." % len(errors)) options["modified_words"] = errors def rank_method_code_statistics(table): """Shows some statistics about this rank method.""" maxID = run_sql("select max(id) from %s" % table) maxID = maxID[0][0] terms = {} Gi = {} write_message("Showing statistics of terms in index:") write_message("Important: For the 'Least used terms', the number of terms is shown first, and the number of occurences second.") write_message("Least used terms---Most important terms---Least important terms") i = 0 while i < maxID: terms_docs=run_sql("SELECT term, hitlist FROM %s WHERE id>= %s and id < %s" % (table, i, i + 10000)) for (t, hitlist) in terms_docs: term_docs=deserialize_via_marshal(hitlist) terms[len(term_docs)] = terms.get(len(term_docs), 0) + 1 if term_docs.has_key("Gi"): Gi[t] = term_docs["Gi"] i=i + 10000 terms=terms.items() terms.sort(lambda x, y: cmp(y[1], x[1])) Gi=Gi.items() Gi.sort(lambda x, y: cmp(y[1], x[1])) for i in range(0, 20): write_message("%s/%s---%s---%s" % (terms[i][0],terms[i][1], Gi[i][0],Gi[len(Gi) - i - 1][0])) def update_rnkWORD(table, terms): """Updates rnkWORDF and rnkWORDR with Gi and Nj values. For each term in rnkWORDF, a Gi value for the term is added. And for each term in each document, the Nj value for that document is added. In rnkWORDR, the Gi value for each term in each document is added. For description on how things are computed, look in the hacking docs. table - name of forward index to update terms - modified terms""" zero_division_msg = """\ ERROR: %s captured. This might be caused by not enough balanced indexes. Please, schedule a regular, e.g. weekly, rebalancing of the word similarity ranking indexes, by using e.g. 
"bibrank -f50000 -R -wwrd -s14d -LSunday" as recommended in %s/help/admin/howto-run""" stime = time.time() Gi = {} Nj = {} N = run_sql("select count(id_bibrec) from %sR" % table[:-1])[0][0] if len(terms) == 0 and task_get_option("quick") == "yes": write_message("No terms to process, ending...") return "" elif task_get_option("quick") == "yes": #not used -R option, fast calculation (not accurate) write_message("Beginning post-processing of %s terms" % len(terms)) #Locating all documents related to the modified/new/deleted terms, if fast update, #only take into account new/modified occurences write_message("Phase 1: Finding records containing modified terms") terms = terms.keys() i = 0 while i < len(terms): terms_docs = get_from_forward_index(terms, i, (i+5000), table) for (t, hitlist) in terms_docs: term_docs = deserialize_via_marshal(hitlist) if term_docs.has_key("Gi"): del term_docs["Gi"] for (j, tf) in term_docs.iteritems(): if (task_get_option("quick") == "yes" and tf[1] == 0) or task_get_option("quick") == "no": Nj[j] = 0 write_message("Phase 1: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms))) i += 5000 write_message("Phase 1: Finished finding records containing modified terms") #Find all terms in the records found in last phase write_message("Phase 2: Finding all terms in affected records") records = Nj.keys() i = 0 while i < len(records): docs_terms = get_from_reverse_index(records, i, (i + 5000), table) for (j, termlist) in docs_terms: doc_terms = deserialize_via_marshal(termlist) for (t, tf) in doc_terms.iteritems(): Gi[t] = 0 write_message("Phase 2: ......processed %s/%s records " % ((i+5000>len(records) and len(records) or (i+5000)), len(records))) i += 5000 write_message("Phase 2: Finished finding all terms in affected records") else: #recalculate max_id = run_sql("SELECT MAX(id) FROM %s" % table) max_id = max_id[0][0] write_message("Beginning recalculation of %s terms" % max_id) terms = [] i = 0 while i < max_id: terms_docs = get_from_forward_index_with_id(i, (i+5000), table) for (t, hitlist) in terms_docs: Gi[t] = 0 term_docs = deserialize_via_marshal(hitlist) if term_docs.has_key("Gi"): del term_docs["Gi"] for (j, tf) in term_docs.iteritems(): Nj[j] = 0 write_message("Phase 1: ......processed %s/%s terms" % ((i+5000)>max_id and max_id or (i+5000), max_id)) i += 5000 write_message("Phase 1: Finished finding which records contains which terms") write_message("Phase 2: Jumping over..already done in phase 1 because of -R option") terms = Gi.keys() Gi = {} i = 0 if task_get_option("quick") == "no": #Calculating Fi and Gi value for each term write_message("Phase 3: Calculating importance of all affected terms") while i < len(terms): terms_docs = get_from_forward_index(terms, i, (i+5000), table) for (t, hitlist) in terms_docs: term_docs = deserialize_via_marshal(hitlist) if term_docs.has_key("Gi"): del term_docs["Gi"] Fi = 0 Gi[t] = 1 for (j, tf) in term_docs.iteritems(): Fi += tf[0] for (j, tf) in term_docs.iteritems(): if tf[0] != Fi: Gi[t] = Gi[t] + ((float(tf[0]) / Fi) * math.log(float(tf[0]) / Fi) / math.log(2)) / math.log(N) write_message("Phase 3: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms))) i += 5000 write_message("Phase 3: Finished calculating importance of all affected terms") else: #Using existing Gi value instead of calculating a new one. Missing some accurancy. 
write_message("Phase 3: Getting approximate importance of all affected terms") while i < len(terms): terms_docs = get_from_forward_index(terms, i, (i+5000), table) for (t, hitlist) in terms_docs: term_docs = deserialize_via_marshal(hitlist) if term_docs.has_key("Gi"): Gi[t] = term_docs["Gi"][1] elif len(term_docs) == 1: Gi[t] = 1 else: Fi = 0 Gi[t] = 1 for (j, tf) in term_docs.iteritems(): Fi += tf[0] for (j, tf) in term_docs.iteritems(): if tf[0] != Fi: Gi[t] = Gi[t] + ((float(tf[0]) / Fi) * math.log(float(tf[0]) / Fi) / math.log(2)) / math.log(N) write_message("Phase 3: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms))) i += 5000 write_message("Phase 3: Finished getting approximate importance of all affected terms") write_message("Phase 4: Calculating normalization value for all affected records and updating %sR" % table[:-1]) records = Nj.keys() i = 0 while i < len(records): #Calculating the normalization value for each document, and adding the Gi value to each term in each document. docs_terms = get_from_reverse_index(records, i, (i + 5000), table) for (j, termlist) in docs_terms: doc_terms = deserialize_via_marshal(termlist) try: for (t, tf) in doc_terms.iteritems(): if Gi.has_key(t): Nj[j] = Nj.get(j, 0) + math.pow(Gi[t] * (1 + math.log(tf[0])), 2) Git = int(math.floor(Gi[t]*100)) if Git >= 0: Git += 1 doc_terms[t] = (tf[0], Git) else: Nj[j] = Nj.get(j, 0) + math.pow(tf[1] * (1 + math.log(tf[0])), 2) Nj[j] = 1.0 / math.sqrt(Nj[j]) Nj[j] = int(Nj[j] * 100) if Nj[j] >= 0: Nj[j] += 1 - run_sql("UPDATE %sR SET termlist=%%s WHERE id_bibrec=%%s" % table[:-1], + run_sql("UPDATE %sR SET termlist=_binary %%s WHERE id_bibrec=%%s" % table[:-1], (serialize_via_marshal(doc_terms), j)) except (ZeroDivisionError, OverflowError), e: ## This is to try to isolate division by zero errors. write_message(zero_division_msg % (e, CFG_SITE_URL), stream=sys.stderr) register_exception(prefix=zero_division_msg % (e, CFG_SITE_URL), alert_admin=True) write_message("Phase 4: ......processed %s/%s records" % ((i+5000>len(records) and len(records) or (i+5000)), len(records))) i += 5000 write_message("Phase 4: Finished calculating normalization value for all affected records and updating %sR" % table[:-1]) write_message("Phase 5: Updating %s with new normalization values" % table) i = 0 terms = Gi.keys() while i < len(terms): #Adding the Gi value to each term, and adding the normalization value to each term in each document. 
terms_docs = get_from_forward_index(terms, i, (i+5000), table) for (t, hitlist) in terms_docs: try: term_docs = deserialize_via_marshal(hitlist) if term_docs.has_key("Gi"): del term_docs["Gi"] for (j, tf) in term_docs.iteritems(): if Nj.has_key(j): term_docs[j] = (tf[0], Nj[j]) Git = int(math.floor(Gi[t]*100)) if Git >= 0: Git += 1 term_docs["Gi"] = (0, Git) - run_sql("UPDATE %s SET hitlist=%%s WHERE term=%%s" % table, + run_sql("UPDATE %s SET hitlist=_binary %%s WHERE term=%%s" % table, (serialize_via_marshal(term_docs), t)) except (ZeroDivisionError, OverflowError), e: write_message(zero_division_msg % (e, CFG_SITE_URL), stream=sys.stderr) register_exception(prefix=zero_division_msg % (e, CFG_SITE_URL), alert_admin=True) write_message("Phase 5: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms))) i += 5000 write_message("Phase 5: Finished updating %s with new normalization values" % table) write_message("Time used for post-processing: %.1fmin" % ((time.time() - stime) / 60)) write_message("Finished post-processing") def get_from_forward_index(terms, start, stop, table): terms_docs = () for j in range(start, (stop < len(terms) and stop or len(terms))): terms_docs += run_sql("SELECT term, hitlist FROM %s WHERE term=%%s" % table, (terms[j],)) return terms_docs def get_from_forward_index_with_id(start, stop, table): terms_docs = run_sql("SELECT term, hitlist FROM %s WHERE id BETWEEN %s AND %s" % (table, start, stop)) return terms_docs def get_from_reverse_index(records, start, stop, table): current_recs = "%s" % records[start:stop] current_recs = current_recs[1:-1] docs_terms = run_sql("SELECT id_bibrec, termlist FROM %sR WHERE id_bibrec IN (%s)" % (table[:-1], current_recs)) return docs_terms #def test_word_separators(phrase="hep-th/0101001"): #"""Tests word separating policy on various input.""" #print "%s:" % phrase #gwfp = get_words_from_phrase(phrase) #for (word, count) in gwfp.iteritems(): #print "\t-> %s - %s" % (word, count) def getName(methname, ln=CFG_SITE_LANG, type='ln'): """Returns the name of the rank method, either in default language or given language. methname = short name of the method ln - the language to get the name in type - which name "type" to get.""" try: rnkid = run_sql("SELECT id FROM rnkMETHOD where name='%s'" % methname) if rnkid: rnkid = str(rnkid[0][0]) res = run_sql("SELECT value FROM rnkMETHODNAME where type='%s' and ln='%s' and id_rnkMETHOD=%s" % (type, ln, rnkid)) if not res: res = run_sql("SELECT value FROM rnkMETHODNAME WHERE ln='%s' and id_rnkMETHOD=%s and type='%s'" % (CFG_SITE_LANG, rnkid, type)) if not res: return methname return res[0][0] else: raise Exception except Exception, e: write_message("Cannot run rank method, either given code for method is wrong, or it has not been added using the webinterface.") raise Exception def word_similarity(run): """Call correct method""" return word_index(run) diff --git a/modules/bibsort/lib/bibsort_engine.py b/modules/bibsort/lib/bibsort_engine.py index 6d0d9d959..9294ccf81 100644 --- a/modules/bibsort/lib/bibsort_engine.py +++ b/modules/bibsort/lib/bibsort_engine.py @@ -1,946 +1,946 @@ # -*- mode: python; coding: utf-8; -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2016 CERN. 
# # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibSort Engine""" import sys import time from invenio.dateutils import datetime, strftime from invenio.dbquery import deserialize_via_marshal, \ serialize_via_marshal, run_sql, Error from invenio.search_engine import get_field_tags, search_pattern from invenio.intbitset import intbitset from invenio.bibtask import write_message, task_update_progress, \ task_sleep_now_if_required from invenio.config import CFG_BIBSORT_BUCKETS, CFG_CERN_SITE from invenio.bibsort_washer import BibSortWasher, \ InvenioBibSortWasherNotImplementedError import invenio.template websearch_templates = invenio.template.load('websearch') #The space distance between elements, to make inserts faster CFG_BIBSORT_WEIGHT_DISTANCE = 8 def get_bibsort_methods_details(method_list = None): """Returns the id, definition, and washer for the methods in method_list. If no method_list is specified: we get all the data from bsrMETHOD table""" bibsort_methods = {} errors = False results = [] if not method_list: try: results = run_sql("SELECT id, name, definition, washer \ FROM bsrMETHOD") except Error, err: write_message("The error: [%s] occured while trying to read " \ "the bibsort data from the database." \ %err, stream=sys.stderr) return {}, True if not results: write_message("The bsrMETHOD table is empty.") return {}, errors else: for method in method_list: try: res = run_sql("""SELECT id, name, definition, washer \ FROM bsrMETHOD where name = %s""", (method, )) except Error, err: write_message("The error: [%s] occured while trying to get " \ "the bibsort data from the database for method %s." \ %(err, method), stream=sys.stderr) errors = True if not res: write_message("No information for method: %s." 
% method) else: results.append(res[0]) for item in results: bibsort_methods.setdefault(item[1], {})['id'] = item[0] bibsort_methods[item[1]]['definition'] = item[2] bibsort_methods[item[1]]['washer'] = item[3] return bibsort_methods, errors def get_all_recids(including_deleted=True):#6.68s on cdsdev """Returns a list of all records available in the system""" res = run_sql("SELECT id FROM bibrec") if not res: return intbitset([]) all_recs = intbitset(res) if not including_deleted: # we want to exclude deleted records if CFG_CERN_SITE: deleted = search_pattern(p='980__:"DELETED" OR 980__:"DUMMY"') else: deleted = search_pattern(p='980__:"DELETED"') all_recs.difference_update(deleted) return all_recs def get_max_recid(): """Returns the max id in bibrec - good approximation for the total number of records""" try: return run_sql("SELECT MAX(id) FROM bibrec")[0][0] except IndexError: return 0 def _get_values_from_marc_tag(tag, recids): '''Finds the value for a specific tag''' digits = tag[0:2] try: intdigits = int(digits) if intdigits < 0 or intdigits > 99: raise ValueError except ValueError: # invalid tag value asked for write_message('You have asked for an invalid tag value ' \ '[tag=%s; value=%s].' %(tag, intdigits), verbose=5) return [] bx = "bib%sx" % digits bibx = "bibrec_bib%sx" % digits max_recid = get_max_recid() if len(recids) == 1: to_append = '= %s' query_params = [recids.tolist()[0]] elif len(recids) < max_recid/3: # if we have less then one third of the records # use IN #This realy depends on how large the repository is.. to_append = 'IN %s' query_params = [tuple(recids)] else: # mysql might crush with big queries, better use BETWEEN to_append = 'BETWEEN %s AND %s' query_params = [1, max_recid] query = 'SELECT bibx.id_bibrec, bx.value \ FROM %s AS bx, %s AS bibx \ WHERE bibx.id_bibrec %s \ AND bx.id = bibx.id_bibxxx \ AND bx.tag LIKE %%s' % (bx, bibx, to_append) query_params.append(tag) res = run_sql(query, tuple(query_params)) return res def get_data_for_definition_marc(tags, recids): '''Having a list of tags and a list of recids, it returns a dictionary with the values correspondig to the tags''' #x = all_recids; [get_fieldvalues(recid, '037__a') for recid in x] #user: 140s, sys: 21s, total: 160s - cdsdev if isinstance(recids, (int, long)): recids = intbitset([recids, ]) # for each recid we need only one value #on which we sort, so we can stop looking for a value # as soon as we find one tag_index = 0 field_data_dict = {} while len(recids) > 0 and tag_index < len(tags): write_message('%s records queried for values for tags %s.' 
\ %(len(recids), tags), verbose=5) res = _get_values_from_marc_tag(tags[tag_index], recids) res_dict = dict(res) #field_data_dict.update(res_dict) #we can not use this, because res_dict might contain recids #that are already in field_data_dict, and we should not overwrite their value field_data_dict = dict(res_dict, **field_data_dict) #there might be keys that we do not want (ex: using 'between') #so we should remove them res_dict_keys = intbitset(res_dict.keys()) recids_not_needed = res_dict_keys.difference(recids) for recid in recids_not_needed: del field_data_dict[recid] #update the recids to contain only the recid that do not have values yet recids.difference_update(res_dict_keys) tag_index += 1 return field_data_dict def get_data_for_definition_rnk(method_name, rnk_name): '''Returns the dictionary with data for method_name ranking method''' try: res = run_sql('SELECT d.relevance_data \ from rnkMETHODDATA d, rnkMETHOD r WHERE \ d.id_rnkMETHOD = r.id AND \ r.name = %s', (rnk_name, )) if res and res[0]: write_message('Data extracted from table rnkMETHODDATA for sorting method %s' \ %method_name, verbose=5) return deserialize_via_marshal(res[0][0]) except Error, err: write_message("No data could be found for sorting method %s. " \ "The following errror occured: [%s]" \ %(method_name, err), stream=sys.stderr) return {} def get_data_for_definition_bibrec(column_name, recids_copy): '''Having a column_name and a list of recids, it returns a dictionary mapping each recids with its correspondig value from the column''' dict_column = {} for recid in recids_copy: creation_date = run_sql('SELECT %s from bibrec WHERE id = %%s' %column_name, (recid, ))[0][0] new_creation_date = datetime(creation_date.year,creation_date.month,creation_date.day, \ creation_date.hour,creation_date.minute, creation_date.second) dict_column[recid] = new_creation_date.strftime('%Y%m%d%H%M%S') return dict_column def get_field_data(recids, method_name, definition): """Returns the data associated with the definition for recids. The returned dictionary will contain ONLY the recids for which a value has been found in the database. """ recids_copy = recids.copy() #if we are dealing with a MARC definition if definition.startswith('MARC'): tags = definition.replace('MARC:', '').replace(' ', '').strip().split(',') if not tags: write_message('No MARC tags found for method %s.' \ %method_name, verbose=5) return {} write_message('The following MARC tags will be queried: %s' %tags, \ verbose=5) return get_data_for_definition_marc(tags, recids_copy) #if we are dealing with tags (ex: author, title) elif definition.startswith('FIELD'): tags = get_field_tags(definition.replace('FIELD:', '').strip()) if not tags: write_message('No tags found for method %s.' 
\ %method_name, verbose=5) return {} write_message('The following tags will be queried: %s' %tags, verbose=5) return get_data_for_definition_marc(tags, recids_copy) # if we are dealing with ranking data elif definition.startswith('RNK'): rnk_name = definition.replace('RNK:', '').strip() return get_data_for_definition_rnk(method_name, rnk_name) # if we are looking into bibrec table elif definition.startswith('BIBREC'): column_name = definition.replace('BIBREC:', '').strip() return get_data_for_definition_bibrec(column_name, recids_copy) else: write_message("The definition %s for method % could not be recognized" \ %(definition, method_name), stream=sys.stderr) return {} def apply_washer(data_dict, washer): '''The values are filtered using the washer function''' if not washer: return if washer.strip() == 'NOOP': return washer = washer.split(':')[0]#in case we have a locale defined try: method = BibSortWasher(washer) write_message('Washer method found: %s' %washer, verbose=5) for recid in data_dict: new_val = method.get_transformed_value(data_dict[recid]) data_dict[recid] = new_val except InvenioBibSortWasherNotImplementedError, err: write_message("Washer %s is not implemented [%s]." \ %(washer, err), stream=sys.stderr) def locale_for_sorting(washer): """Identifies if any specific locale should be used, and it returns it""" if washer.find(":") > -1: lang = washer[washer.index(':')+1:] return websearch_templates.tmpl_localemap.get(lang, websearch_templates.tmpl_default_locale) return None def run_sorting_method(recids, method_name, method_id, definition, washer): """Does the actual sorting for the method_name for all the records in the database""" run_sorting_for_rnk = False if definition.startswith('RNK'): run_sorting_for_rnk = True field_data_dictionary = get_field_data(recids, method_name, definition) if not field_data_dictionary: write_message("POSSIBLE ERROR: The sorting method --%s-- has no data!" \ %method_name) return True apply_washer(field_data_dictionary, washer) #do we have any locale constraint? 
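A short illustration of the washer string convention handled by apply_washer() and locale_for_sorting() above, using a hypothetical washer name: the part before the colon selects the BibSortWasher transformation (or NOOP for none), the part after it names a language that is mapped to a locale for collation.

    washer = "sort_alphanumerically:fr"     # hypothetical bsrMETHOD.washer value
    washer_name = washer.split(':')[0]      # "sort_alphanumerically" -> BibSortWasher(washer_name)
    lang = washer[washer.index(':') + 1:]   # "fr", looked up in websearch_templates.tmpl_localemap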
sorting_locale = locale_for_sorting(washer) sorted_data_list, sorted_data_dict = \ sort_dict(field_data_dictionary, CFG_BIBSORT_WEIGHT_DISTANCE, run_sorting_for_rnk, sorting_locale) executed = write_to_methoddata_table(method_id, field_data_dictionary, \ sorted_data_dict, sorted_data_list) if not executed: return False if CFG_BIBSORT_BUCKETS > 1: bucket_dict, bucket_last_rec_dict = split_into_buckets(sorted_data_list, len(sorted_data_list)) for idx in bucket_dict: executed = write_to_buckets_table(method_id, idx, bucket_dict[idx], \ sorted_data_dict[bucket_last_rec_dict[idx]]) if not executed: return False else: executed = write_to_buckets_table(method_id, 1, intbitset(sorted_data_list), \ sorted_data_list[-1]) if not executed: return False return True def write_to_methoddata_table(id_method, data_dict, data_dict_ordered, data_list_sorted, update_timestamp=True): """Serialize the date and write it to the bsrMETHODDATA""" write_message('Starting serializing the data..', verbose=5) serialized_data_dict = serialize_via_marshal(data_dict) serialized_data_dict_ordered = serialize_via_marshal(data_dict_ordered) serialized_data_list_sorted = serialize_via_marshal(data_list_sorted) write_message('Serialization completed.', verbose=5) date = strftime("%Y-%m-%d %H:%M:%S", time.localtime()) if not update_timestamp: try: date = run_sql('SELECT last_updated from bsrMETHODDATA WHERE id_bsrMETHOD = %s', (id_method, ))[0][0] except IndexError: pass # keep the generated date write_message("Starting writing the data for method_id=%s " \ "to the database (table bsrMETHODDATA)" %id_method, verbose=5) try: write_message('Deleting old data..', verbose=5) run_sql("DELETE FROM bsrMETHODDATA WHERE id_bsrMETHOD = %s", (id_method, )) write_message('Inserting new data..', verbose=5) run_sql("INSERT into bsrMETHODDATA \ (id_bsrMETHOD, data_dict, data_dict_ordered, data_list_sorted, last_updated) \ - VALUES (%s, %s, %s, %s, %s)", \ + VALUES (%s, _binary %s, _binary %s, _binary %s, %s)", \ (id_method, serialized_data_dict, serialized_data_dict_ordered, \ serialized_data_list_sorted, date, )) except Error, err: write_message("The error [%s] occured when inserting new bibsort data "\ "into bsrMETHODATA table" %err, sys.stderr) return False write_message('Writing to the bsrMETHODDATA successfully completed.', \ verbose=5) return True def write_to_buckets_table(id_method, bucket_no, bucket_data, bucket_last_value, update_timestamp=True): """Serialize the date and write it to the bsrMEHODDATA_BUCKETS""" write_message('Writing the data for bucket number %s for ' \ 'method_id=%s to the database' \ %(bucket_no, id_method), verbose=5) write_message('Serializing data for bucket number %s' %bucket_no, verbose=5) serialized_bucket_data = bucket_data.fastdump() date = strftime("%Y-%m-%d %H:%M:%S", time.localtime()) if not update_timestamp: try: date = run_sql('SELECT last_updated from bsrMETHODDATABUCKET WHERE id_bsrMETHOD = %s and bucket_no = %s', \ (id_method, bucket_no))[0][0] except IndexError: pass # keep the generated date try: write_message('Deleting old data.', verbose=5) run_sql("DELETE FROM bsrMETHODDATABUCKET \ WHERE id_bsrMETHOD = %s AND bucket_no = %s", \ (id_method, bucket_no, )) write_message('Inserting new data.', verbose=5) run_sql("INSERT into bsrMETHODDATABUCKET \ (id_bsrMETHOD, bucket_no, bucket_data, bucket_last_value, last_updated) \ - VALUES (%s, %s, %s, %s, %s)", \ + VALUES (%s, %s, _binary %s, %s, %s)", \ (id_method, bucket_no, serialized_bucket_data, bucket_last_value, date, )) except Error, err: 
write_message("The error [%s] occured when inserting new bibsort data " \ "into bsrMETHODATA_BUCKETS table" %err, sys.stderr) return False write_message('Writing to bsrMETHODDATABUCKET for ' \ 'bucket number %s completed.' %bucket_no, verbose=5) return True def split_into_buckets(sorted_data_list, data_size): """The sorted_data_list is split into equal buckets. Returns a dictionary containing the buckets and a dictionary containing the last record in each bucket""" write_message("Starting splitting the data into %s buckets." \ %CFG_BIBSORT_BUCKETS, verbose=5) bucket_dict = {} bucket_last_rec_dict = {} step = data_size/CFG_BIBSORT_BUCKETS i = 0 for i in xrange(CFG_BIBSORT_BUCKETS - 1): bucket_dict[i+1] = intbitset(sorted_data_list[i*step:i*step+step]) bucket_last_rec_dict[i+1] = sorted_data_list[i*step+step-1] write_message("Bucket %s done." %(i+1), verbose=5) #last bucket contains all the remaining data bucket_dict[CFG_BIBSORT_BUCKETS] = intbitset(sorted_data_list[(i+1)*step:]) bucket_last_rec_dict[CFG_BIBSORT_BUCKETS] = sorted_data_list[-1] write_message("Bucket %s done." %CFG_BIBSORT_BUCKETS, verbose=5) write_message("Splitting completed.", verbose=5) return bucket_dict, bucket_last_rec_dict def sort_dict(dictionary, spacing=1, run_sorting_for_rnk=False, sorting_locale=None): """Sorting a dictionary. Returns a list of sorted recids and also a dictionary containing the recid: weight weight = index * spacing""" #10Mil records dictionary -> 36.9s write_message("Starting sorting the dictionary " \ "containing all the data..", verbose=5) sorted_records_dict_with_id = {} if sorting_locale: import locale orig_locale = locale.getlocale(locale.LC_ALL) try: locale.setlocale(locale.LC_ALL, sorting_locale) except locale.Error: try: locale.setlocale(locale.LC_ALL, sorting_locale + '.UTF8') except locale.Error: write_message("Setting locale to %s is not working.. 
ignoring locale") sorted_records_list = sorted(dictionary, key=dictionary.__getitem__, cmp=locale.strcoll, reverse=False) locale.setlocale(locale.LC_ALL, orig_locale) else: sorted_records_list = sorted(dictionary, key=dictionary.__getitem__, reverse=False) if run_sorting_for_rnk: #for ranking, we can keep the actual values associated with the recids return sorted_records_list, dictionary else: index = 1 for recid in sorted_records_list: sorted_records_dict_with_id[recid] = index * spacing index += 1 write_message("Dictionary sorted.", verbose=5) return sorted_records_list, sorted_records_dict_with_id def get_modified_or_inserted_recs(method_list): """Returns a list of recids that have been inserted or modified since the last update of the bibsort methods in method_list method_list should already contain a list of methods that SHOULD be updated, if it contains new methods, an error will be thrown""" if not method_list: #just to be on the safe side return 0 try: query = "SELECT min(d.last_updated) from bsrMETHODDATA d, bsrMETHOD m \ WHERE m.name in (%s) AND d.id_bsrMETHOD = m.id" % \ ("%s," * len(method_list))[:-1] last_updated = str(run_sql(query, tuple(method_list))[0][0]) except Error, err: write_message("Error when trying to get the last_updated date " \ "from bsrMETHODDATA: [%s]" %err, sys.stderr) return 0 recids = [] try: results = run_sql("SELECT id from bibrec \ where modification_date >= %s", (last_updated, )) if results: recids = [result[0] for result in results] except Error, err: write_message("Error when trying to get the list of " \ "modified records: [%s]" %err, sys.stderr) return 0 return recids def get_rnk_methods(bibsort_methods): """Returns the list of bibsort methods (names) that are RNK methods""" return [method for method in bibsort_methods if \ bibsort_methods[method]['definition'].startswith('RNK')] def get_modified_non_rnk_methods(non_rnk_method_list): """Returns 2 lists of non RNK methods: updated_ranking_methods = non RNK methods that need to be updated inserted_ranking_methods = non RNK methods, that have no data yet, so rebalancing should run on them""" updated_ranking_methods = [] inserted_ranking_methods = [] for method in non_rnk_method_list: try: dummy = str(run_sql('SELECT d.last_updated \ FROM bsrMETHODDATA d, bsrMETHOD m \ WHERE m.id = d.id_bsrMETHOD \ AND m.name=%s', (method, ))[0][0]) updated_ranking_methods.append(method) except IndexError: #method is not in bsrMETHODDATA -> is new inserted_ranking_methods.append(method) return updated_ranking_methods, inserted_ranking_methods def get_modified_rnk_methods(rnk_method_list, bibsort_methods): """Returns the list of RNK methods that have been recently modified, so they will need to have their bibsort data updated""" updated_ranking_methods = [] deleted_ranking_methods = [] for method in rnk_method_list: method_name = bibsort_methods[method]['definition'].replace('RNK:', '').strip() try: last_updated_rnk = str(run_sql('SELECT last_updated \ FROM rnkMETHOD \ WHERE name = %s', (method_name, ))[0][0]) except IndexError: write_message("The method %s could not be found in rnkMETHOD" \ %(method_name), stream=sys.stderr) #this method does not exist in rnkMETHOD, #it might have been a mistype or it might have been deleted deleted_ranking_methods.append(method) if method not in deleted_ranking_methods: try: last_updated_bsr = str(run_sql('SELECT d.last_updated \ FROM bsrMETHODDATA d, bsrMETHOD m \ WHERE m.id = d.id_bsrMETHOD \ AND m.name=%s', (method, ))[0][0]) if last_updated_rnk >= last_updated_bsr: # rnk data has 
been updated after bibsort ran updated_ranking_methods.append(method) else: write_message("The method %s has not been updated "\ "since the last run of bibsort." %method) except IndexError: write_message("The method %s could not be found in bsrMETHODDATA" \ %(method)) # that means that the bibsort never run on this method, so let's run it updated_ranking_methods.append(method) return updated_ranking_methods, deleted_ranking_methods def delete_bibsort_data_for_method(method_id): """This method will delete all data asociated with a method from bibsort tables (except bsrMETHOD). Returns False in case some error occured, True otherwise""" try: run_sql("DELETE FROM bsrMETHODDATA WHERE id_bsrMETHOD = %s", (method_id, )) run_sql("DELETE FROM bsrMETHODDATABUCKET WHERE id_bsrMETHOD = %s", (method_id, )) except: return False return True def delete_all_data_for_method(method_id): """This method will delete all data asociated with a method from bibsort tables. Returns False in case some error occured, True otherwise""" method_name = 'method name' try: run_sql("DELETE FROM bsrMETHODDATA WHERE id_bsrMETHOD = %s", (method_id, )) run_sql("DELETE FROM bsrMETHODDATABUCKET WHERE id_bsrMETHOD = %s", (method_id, )) run_sql("DELETE FROM bsrMETHODNAME WHERE id_bsrMETHOD = %s", (method_id, )) run_sql("DELETE FROM bsrMETHOD WHERE id = %s", (method_id, )) method_name = run_sql("SELECT name from bsrMETHOD WHERE id = %s", (method_id, ))[0][0] except Error: return False except IndexError: return True if method_name:# the method has not been deleted return False return True def add_sorting_method(method_name, method_definition, method_treatment): """This method will add a new sorting method in the database and update the config file""" try: run_sql("INSERT INTO bsrMETHOD(name, definition, washer) \ VALUES (%s, %s, %s)", (method_name, method_definition, method_treatment)) except Error: return False return True def update_bibsort_tables(recids, method, update_timestamp = True): """Updates the data structures for sorting method: method for the records in recids""" res = run_sql("SELECT id, definition, washer \ from bsrMETHOD where name = %s", (method, )) if res and res[0]: method_id = res[0][0] definition = res[0][1] washer = res[0][2] else: write_message('No sorting method called %s could be found ' \ 'in bsrMETHOD table.' %method, sys.stderr) return False res = run_sql("SELECT data_dict, data_dict_ordered, data_list_sorted \ FROM bsrMETHODDATA where id_bsrMETHOD = %s", (method_id, )) if res and res[0]: data_dict = deserialize_via_marshal(res[0][0]) data_dict_ordered = {} data_list_sorted = [] else: write_message('No data could be found for the sorting method %s.' \ %method) return False #since this case should have been treated earlier #get the values for the recids that need to be recalculated field_data = get_field_data(recids, method, definition) if not field_data: write_message("Possible error: the method %s has no data for records %s." 
\ %(method, str(recids))) else: apply_washer(field_data, washer) #if a recid is not in field_data that is because no value was found for it #so it should be marked for deletion recids_to_delete = list(recids.difference(intbitset(field_data.keys()))) recids_to_insert = [] recids_to_modify = {} for recid in field_data: if recid in data_dict: if data_dict[recid] != field_data[recid]: #we store the old value recids_to_modify[recid] = data_dict[recid] else: # recid is new, and needs to be inserted recids_to_insert.append(recid) #remove the recids that were not previously in bibsort recids_to_delete = [recid for recid in recids_to_delete if recid in data_dict] #dicts to keep the ordered values for the recids - useful bor bucket insertion recids_current_ordered = {} recids_old_ordered = {} if recids_to_insert or recids_to_modify or recids_to_delete: data_dict_ordered = deserialize_via_marshal(res[0][1]) data_list_sorted = deserialize_via_marshal(res[0][2]) if recids_to_modify: write_message("%s records have been modified." \ %len(recids_to_modify), verbose=5) for recid in recids_to_modify: recids_old_ordered[recid] = data_dict_ordered[recid] perform_modify_record(data_dict, data_dict_ordered, \ data_list_sorted, field_data[recid], recid) if recids_to_insert: write_message("%s records have been inserted." \ %len(recids_to_insert), verbose=5) for recid in recids_to_insert: perform_insert_record(data_dict, data_dict_ordered, \ data_list_sorted, field_data[recid], recid) if recids_to_delete: write_message("%s records have been deleted." \ %len(recids_to_delete), verbose=5) for recid in recids_to_delete: perform_delete_record(data_dict, data_dict_ordered, data_list_sorted, recid) for recid in recids_to_modify: recids_current_ordered[recid] = data_dict_ordered[recid] for recid in recids_to_insert: recids_current_ordered[recid] = data_dict_ordered[recid] #write the modifications to db executed = write_to_methoddata_table(method_id, data_dict, \ data_dict_ordered, data_list_sorted, update_timestamp) if not executed: return False #update buckets try: perform_update_buckets(recids_current_ordered, recids_to_insert, recids_old_ordered, method_id, update_timestamp) except Error, err: write_message("[%s] The bucket data for method %s has not been updated" \ %(method, err), sys.stderr) return False return True def perform_update_buckets(recids_current_ordered, recids_to_insert, recids_old_ordered, method_id, update_timestamp = True): """Updates the buckets""" bucket_insert = {} bucket_delete = {} write_message("Updating the buckets for method_id = %s" %method_id, verbose=5) buckets = run_sql("SELECT bucket_no, bucket_last_value \ FROM bsrMETHODDATABUCKET \ WHERE id_bsrMETHOD = %s", (method_id, )) if not buckets: write_message("No bucket data found for method_id %s." 
\ %method_id, sys.stderr) raise Exception #sort the buckets to be sure we are iterating them in order(1 to max): buckets_dict = dict(buckets) for recid in recids_to_insert: for bucket_no in buckets_dict: if recids_current_ordered[recid] <= buckets_dict[bucket_no]: bucket_insert.setdefault(bucket_no, []).append(recid) break for recid in recids_old_ordered: record_inserted = 0 record_deleted = 0 for bucket_no in buckets_dict: bucket_value = int(buckets_dict[bucket_no]) if record_inserted and record_deleted: #both insertion and deletion have been registered break if recids_current_ordered[recid] <= bucket_value and \ recids_old_ordered[recid] <= bucket_value and \ not record_inserted and \ not record_deleted: #both before and after the modif, #recid should be in the same bucket -> nothing to do break if recids_current_ordered[recid] <= bucket_value and not record_inserted: #recid should be, after the modif, here, so insert bucket_insert.setdefault(bucket_no, []).append(recid) record_inserted = 1 if recids_old_ordered[recid] <= bucket_value and not record_deleted: #recid was here before modif, must be removed bucket_delete.setdefault(bucket_no, []).append(recid) record_deleted = 1 for bucket_no in buckets_dict: if (bucket_no in bucket_insert) or (bucket_no in bucket_delete): res = run_sql("SELECT bucket_data FROM bsrMETHODDATABUCKET \ where id_bsrMETHOD = %s AND bucket_no = %s", \ (method_id, bucket_no, )) bucket_data = intbitset(res[0][0]) for recid in bucket_insert.get(bucket_no, []): bucket_data.add(recid) for recid in bucket_delete.get(bucket_no, []): if recid in bucket_data: bucket_data.remove(recid) if update_timestamp: date = strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("UPDATE bsrMETHODDATABUCKET \ - SET bucket_data = %s, last_updated = %s \ + SET bucket_data = _binary %s, last_updated = %s \ WHERE id_bsrMETHOD = %s AND bucket_no = %s", \ (bucket_data.fastdump(), date, method_id, bucket_no, )) else: run_sql("UPDATE bsrMETHODDATABUCKET \ - SET bucket_data = %s \ + SET bucket_data = _binary %s \ WHERE id_bsrMETHOD = %s AND bucket_no = %s", \ (bucket_data.fastdump(), method_id, bucket_no, )) write_message("Updating bucket %s for method %s." 
%(bucket_no, method_id), verbose=5) def perform_modify_record(data_dict, data_dict_ordered, data_list_sorted, value, recid, spacing=CFG_BIBSORT_WEIGHT_DISTANCE): """Modifies all the data structures with the new information about the record""" #remove the recid from the old position, to make place for the new value data_list_sorted.remove(recid) # from now on, it is the same thing as insert return perform_insert_record(data_dict, data_dict_ordered, data_list_sorted, value, recid, spacing) def perform_insert_record(data_dict, data_dict_ordered, data_list_sorted, value, recid, spacing=CFG_BIBSORT_WEIGHT_DISTANCE): """Inserts a new record into all the data structures""" #data_dict data_dict[recid] = value #data_dict_ordered & data_list_sorted #calculate at which index the rec should be inserted in data_list_sorted index_for_insert = binary_search(data_list_sorted, value, data_dict) #we have to calculate the weight of this record in data_dict_ordered #and it will be the med between its neighbours in the data_list_sorted if index_for_insert == len(data_list_sorted):#insert at the end of the list #append at the end of the list data_list_sorted.append(recid) #weight = highest weight + the distance data_dict_ordered[recid] = data_dict_ordered[data_list_sorted[index_for_insert - 1]] + spacing else: if index_for_insert == 0: #insert at the begining of the list left_neighbor_weight = 0 else: left_neighbor_weight = data_dict_ordered[data_list_sorted[index_for_insert - 1]] right_neighbor_weight = data_dict_ordered[data_list_sorted[index_for_insert]] #the recid's weight will be the med between left and right weight = (right_neighbor_weight - left_neighbor_weight)/2 if weight < 1: #there is no more space to insert, we have to create some space data_list_sorted.insert(index_for_insert, recid) data_dict_ordered[recid] = left_neighbor_weight + spacing create_space_for_new_weight(index_for_insert, data_dict_ordered, data_list_sorted, spacing) else: data_list_sorted.insert(index_for_insert, recid) data_dict_ordered[recid] = left_neighbor_weight + weight write_message("Record %s done." %recid, verbose=5) return index_for_insert def perform_delete_record(data_dict, data_dict_ordered, data_list_sorted, recid): """Delete a record from all the data structures""" #data_dict del data_dict[recid] #data_list_sorted data_list_sorted.remove(recid) #data_dict_ordered del data_dict_ordered[recid] write_message("Record %s done." %recid, verbose=5) return 1 def create_space_for_new_weight(index_for_insert, data_dict_ordered, data_list_sorted, spacing): """In order to keep an order of the records in data_dict_ordered, when a new weight is inserted, there needs to be some place for it (ex: recid3 needs to be inserted between recid1-with weight=10 and recid2-with weight=11) The scope of this function is to increease the distance between recid1 and recid2 (and thus all the weights after recid2) so that recid3 will have an integer weight""" for i in range(index_for_insert+1, len(data_list_sorted)): data_dict_ordered[data_list_sorted[i]] += 2 * spacing def binary_search(sorted_list, value, data_dict): """Binary Search O(log n)""" minimum = -1 maximum = len(sorted_list) while maximum - minimum > 1: med = (maximum+minimum)/2 recid1 = sorted_list[med] value1 = data_dict[recid1] if value1 > value: maximum = med elif value1 < value: minimum = med else: return med return minimum + 1 def run_bibsort_update(recids=None, method_list=None): """Updates bibsort tables for the methods in method_list and for the records in recids. 
If recids is None: recids = all records that have been modified or inserted since last update If method_list is None: method_list = all the methods available in bsrMETHOD table""" write_message('Initial data for run_bibsort_update method: ' \ 'number of recids = %s; method_list=%s' \ %(str(len(recids)), method_list), verbose=5) write_message('Updating sorting data.') bibsort_methods, errors = get_bibsort_methods_details(method_list) if errors: return False method_list = bibsort_methods.keys() if not method_list: write_message('No methods found in bsrMETHOD table.. exiting.') return True #we could have 4 types of methods: #(i) RNK methods -> they should be rebalanced, not updated #(ii) RNK methods to delete -> we should delete their data #(iii) non RNK methods to update #(iv) non RNK methods that are new -> they should be rebalanced(sorted), not updated #check which of the methods are RNK methods (they do not need modified recids) rnk_methods = get_rnk_methods(bibsort_methods) rnk_methods_updated, rnk_methods_deleted = get_modified_rnk_methods(rnk_methods, bibsort_methods) #check which of the methods have no data, so they are actually new, #so they need balancing(sorting) instead of updating non_rnk_methods = [method for method in bibsort_methods.keys() if method not in rnk_methods] non_rnk_methods_updated, non_rnk_methods_inserted = get_modified_non_rnk_methods(non_rnk_methods) #(i) + (iv) methods_to_balance = rnk_methods_updated + non_rnk_methods_inserted if methods_to_balance: # several methods require rebalancing(sorting) and not updating return run_bibsort_rebalance(methods_to_balance) #(ii) #remove the data for the ranking methods that have been deleted for method in rnk_methods_deleted: task_sleep_now_if_required(can_stop_too=True) task_update_progress("Deleting data for method %s" %method) write_message('Starting deleting the data for RNK method %s' %method, verbose=5) executed_ok = delete_bibsort_data_for_method(bibsort_methods[method]['id']) if not executed_ok: write_message('Method %s could not be deleted correctly, aborting..' \ %method, sys.stderr) return False #(iii) #methods to actually update if non_rnk_methods_updated: # we want to update some 'normal'(not RNK) tables, so we need recids update_timestamp = False if not recids: recids = get_modified_or_inserted_recs(non_rnk_methods_updated) if recids == 0: #error signal return False if not recids: write_message("No records inserted or modified in bibrec table " \ "since the last update of bsrMETHODDATA.") return True write_message("These records have been recently modified/inserted: %s" \ %str(recids), verbose=5) update_timestamp = True recids_i = intbitset(recids) for method in non_rnk_methods_updated: task_sleep_now_if_required(can_stop_too=True) task_update_progress("Updating method %s" %method) write_message('Starting updating method %s' %method, verbose=5) executed_ok = update_bibsort_tables(recids_i, method, update_timestamp) if not executed_ok: write_message('Method %s could not be executed correctly, aborting..' \ %method, sys.stderr) return False return True def run_bibsort_rebalance(method_list = None): """Rebalances all buckets for the methods in method_list""" bibsort_methods, errors = get_bibsort_methods_details(method_list) if errors: return False if not bibsort_methods: write_message('No methods found.. 
exiting rebalancing.') return True #check if there are only ranking methods -> no need for recids rnk_methods = get_rnk_methods(bibsort_methods) non_rnk_method = [method for method in bibsort_methods.keys() if method not in rnk_methods] write_message('Running rebalancing for methods: %s' %bibsort_methods.keys()) if non_rnk_method:# we have also 'normal' (no RNK) methods, so we need the recids recids = get_all_recids(including_deleted=False) write_message('Rebalancing will run for %s records.' \ %str(len(recids)), verbose=5) task_sleep_now_if_required(can_stop_too=True) else: recids = intbitset([]) write_message('Rebalancing will run only for RNK methods') for name in bibsort_methods: task_update_progress('Rebalancing %s method.' %name) write_message('Starting sorting the data for %s method ... ' \ %name.upper()) executed_ok = run_sorting_method(recids, name, bibsort_methods[name]['id'], bibsort_methods[name]['definition'], bibsort_methods[name]['washer']) if not executed_ok: write_message('Method %s could not be executed correctly.' \ %name, sys.stderr) return False write_message('Done.') task_sleep_now_if_required(can_stop_too=True) task_update_progress('Rebalancing done.') return True def main(): """tests""" #print "Running bibsort_rebalance...." #run_bibsort_rebalance() #rebalances everything #print "Running bibsort_rebalance for title and author...." #run_bibsort_rebalance(['title', 'author']) #rebalances only these methods #print "Running bibsort_update...." #run_bibsort_update() #update all the methods #print "Running bibsort_update for title and author...." #run_bibsort_update(method_list = ['title', 'author']) #print "Running bibsort_update for records 1,2,3, title author...." #run_bibsort_update(recids = [1, 2, 3], method_list = ['title', 'author']) if __name__ == '__main__': main() diff --git a/modules/bibupload/lib/bibupload.py b/modules/bibupload/lib/bibupload.py index 5a056b147..07e11cf68 100644 --- a/modules/bibupload/lib/bibupload.py +++ b/modules/bibupload/lib/bibupload.py @@ -1,3003 +1,3003 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 CERN. +# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibUpload: Receive MARC XML file and update the appropriate database tables according to options. 
""" __revision__ = "$Id$" import os import re import sys import time from datetime import datetime from zlib import compress import socket import marshal import copy import tempfile import urlparse import urllib2 import urllib from invenio.config import CFG_OAI_ID_FIELD, \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, \ CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG, \ CFG_BIBUPLOAD_STRONG_TAGS, \ CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS, \ CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE, \ CFG_BIBUPLOAD_DELETE_FORMATS, \ CFG_SITE_URL, \ CFG_SITE_SECURE_URL, \ CFG_SITE_RECORD, \ CFG_OAI_PROVENANCE_ALTERED_SUBFIELD, \ CFG_BIBUPLOAD_DISABLE_RECORD_REVISIONS, \ CFG_BIBUPLOAD_CONFLICTING_REVISION_TICKET_QUEUE, \ CFG_CERN_SITE, \ CFG_BIBUPLOAD_MATCH_DELETED_RECORDS from invenio.jsonutils import json, CFG_JSON_AVAILABLE from invenio.bibupload_config import CFG_BIBUPLOAD_CONTROLFIELD_TAGS, \ CFG_BIBUPLOAD_SPECIAL_TAGS, \ CFG_BIBUPLOAD_DELETE_CODE, \ CFG_BIBUPLOAD_DELETE_VALUE, \ CFG_BIBUPLOAD_OPT_MODES from invenio.dbquery import run_sql from invenio.bibrecord import create_records, \ record_add_field, \ record_delete_field, \ record_xml_output, \ record_get_field_instances, \ record_get_field_value, \ record_get_field_values, \ field_get_subfield_values, \ field_get_subfield_instances, \ record_modify_subfield, \ record_delete_subfield_from, \ record_delete_fields, \ record_add_subfield_into, \ record_find_field, \ record_extract_oai_id, \ record_extract_dois, \ record_has_field, \ records_identical, \ record_drop_duplicate_fields from invenio.search_engine import get_record, record_exists, search_pattern from invenio.dateutils import convert_datestruct_to_datetext from invenio.errorlib import register_exception from invenio.bibcatalog import BIBCATALOG_SYSTEM from invenio.intbitset import intbitset from invenio.urlutils import make_user_agent_string from invenio.config import CFG_BIBDOCFILE_FILEDIR from invenio.bibtask import task_init, write_message, \ task_set_option, task_get_option, task_get_task_param, \ task_update_progress, task_sleep_now_if_required, fix_argv_paths, \ RecoverableError from invenio.bibdocfile import BibRecDocs, file_strip_ext, normalize_format, \ get_docname_from_url, check_valid_url, download_url, \ KEEP_OLD_VALUE, decompose_bibdocfile_url, InvenioBibDocFileError, \ bibdocfile_url_p, CFG_BIBDOCFILE_AVAILABLE_FLAGS, guess_format_from_url, \ BibRelation, MoreInfo from invenio.search_engine import search_pattern from invenio.bibupload_revisionverifier import RevisionVerifier, \ InvenioBibUploadConflictingRevisionsError, \ InvenioBibUploadInvalidRevisionError, \ InvenioBibUploadMissing005Error, \ InvenioBibUploadUnchangedRecordError #Statistic variables stat = {} stat['nb_records_to_upload'] = 0 stat['nb_records_updated'] = 0 stat['nb_records_inserted'] = 0 stat['nb_errors'] = 0 stat['nb_holdingpen'] = 0 stat['exectime'] = time.localtime() _WRITING_RIGHTS = None CFG_BIBUPLOAD_ALLOWED_SPECIAL_TREATMENTS = ('oracle', ) CFG_HAS_BIBCATALOG = "UNKNOWN" def check_bibcatalog(): """ Return True if bibcatalog is available. 
""" global CFG_HAS_BIBCATALOG # pylint: disable=W0603 if CFG_HAS_BIBCATALOG != "UNKNOWN": return CFG_HAS_BIBCATALOG CFG_HAS_BIBCATALOG = True if BIBCATALOG_SYSTEM is not None: bibcatalog_response = BIBCATALOG_SYSTEM.check_system() else: bibcatalog_response = "No ticket system configured" if bibcatalog_response != "": write_message("BibCatalog error: %s\n" % (bibcatalog_response,)) CFG_HAS_BIBCATALOG = False return CFG_HAS_BIBCATALOG # Let's set a reasonable timeout for URL request (e.g. FFT) socket.setdefaulttimeout(40) def parse_identifier(identifier): """Parse the identifier and determine if it is temporary or fixed""" id_str = str(identifier) if not id_str.startswith("TMP:"): return (False, identifier) else: return (True, id_str[4:]) def resolve_identifier(tmps, identifier): """Resolves an identifier. If the identifier is not temporary, this function is an identity on the second argument. Otherwise, a resolved value is returned or an exception raised""" is_tmp, tmp_id = parse_identifier(identifier) if is_tmp: if not tmp_id in tmps: raise StandardError("Temporary identifier %s not present in the dictionary" % (tmp_id, )) if tmps[tmp_id] == -1: # the identifier has been signalised but never assigned a value - probably error during processing raise StandardError("Temporary identifier %s has been declared, but never assigned a value. Probably an error during processign of an appropriate FFT has happened. Please see the log" % (tmp_id, )) return int(tmps[tmp_id]) else: return int(identifier) _re_find_001 = re.compile('\\s*(\\d*)\\s*', re.S) def bibupload_pending_recids(): """This function embed a bit of A.I. and is more a hack than an elegant algorithm. It should be updated in case bibupload/bibsched are modified in incompatible ways. This function return the intbitset of all the records that are being (or are scheduled to be) touched by other bibuploads. """ options = run_sql("""SELECT arguments FROM schTASK WHERE status<>'DONE' AND proc='bibupload' AND (status='RUNNING' OR status='CONTINUING' OR status='WAITING' OR status='SCHEDULED' OR status='ABOUT TO STOP' OR status='ABOUT TO SLEEP')""") ret = intbitset() xmls = [] if options: for arguments in options: arguments = marshal.loads(arguments[0]) for argument in arguments[1:]: if argument.startswith('/'): # XMLs files are recognizable because they're absolute # files... xmls.append(argument) for xmlfile in xmls: # Let's grep for the 001 try: xml = open(xmlfile).read() ret += [int(group[1]) for group in _re_find_001.findall(xml)] except: continue return ret ### bibupload engine functions: def bibupload(record, opt_mode=None, opt_notimechange=0, oai_rec_id="", pretend=False, tmp_ids=None, tmp_vers=None): """Main function: process a record and fit it in the tables bibfmt, bibrec, bibrec_bibxxx, bibxxx with proper record metadata. Return (error_code, recID) of the processed record. """ if tmp_ids is None: tmp_ids = {} if tmp_vers is None: tmp_vers = {} if opt_mode == 'reference': ## NOTE: reference mode has been deprecated in favour of 'correct' opt_mode = 'correct' assert(opt_mode in CFG_BIBUPLOAD_OPT_MODES) try: record_xml_output(record).decode('utf-8') except UnicodeDecodeError: msg = " Failed: Invalid utf-8 characters." 
write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) error = None affected_tags = {} original_record = {} rec_old = {} record_modification_date = datetime.now() # will hold record creation/modification date record_had_altered_bit = False is_opt_mode_delete = False # Extraction of the Record Id from 001, SYSNO or OAIID or DOI tags: rec_id = retrieve_rec_id(record, opt_mode, pretend=pretend) if rec_id == -1: msg = " Failed: either the record already exists and insert was " \ "requested or the record does not exists and " \ "replace/correct/append has been used" write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) elif rec_id > 0: write_message(" -Retrieve record ID (found %s): DONE." % rec_id, verbose=2) (unique_p, msg) = check_record_doi_is_unique(rec_id, record) if not unique_p: write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if not record.has_key('001'): # Found record ID by means of SYSNO or OAIID or DOI, and the # input MARCXML buffer does not have this 001 tag, so we # should add it now: error = record_add_field(record, '001', controlfield_value=rec_id) if error is None: msg = " Failed: Error during adding the 001 controlfield " \ "to the record" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: error = None write_message(" -Added tag 001: DONE.", verbose=2) write_message(" -Check if the xml marc file is already in the database: DONE" , verbose=2) record_deleted_p = False record_creation_date = None if opt_mode == 'insert' or \ (opt_mode == 'replace_or_insert') and rec_id is None: insert_mode_p = True # Insert the record into the bibrec databases to have a recordId rec_id = create_new_record(pretend=pretend) write_message(" -Creation of a new record id (%d): DONE" % rec_id, verbose=2) # we add the record Id control field to the record error = record_add_field(record, '001', controlfield_value=rec_id) if error is None: msg = " Failed: Error during adding the 001 controlfield " \ "to the record" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: error = None if '005' not in record: error = record_add_field(record, '005', controlfield_value=record_modification_date.strftime("%Y%m%d%H%M%S.0")) if error is None: msg = " ERROR: during adding to 005 controlfield to record" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: error = None else: write_message(" Note: 005 already existing upon inserting of new record. Keeping it.", verbose=2) record_creation_date = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(record['005'][0][3].split('.')[0], "%Y%m%d%H%M%S")) elif opt_mode != 'insert': insert_mode_p = False # Update Mode # Retrieve the old record to update rec_old = get_record(rec_id) record_had_altered_bit = record_get_field_values(rec_old, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4], CFG_OAI_PROVENANCE_ALTERED_SUBFIELD) # Also save a copy to restore previous situation in case of errors original_record = get_record(rec_id) if rec_old is None: msg = " Failed during the creation of the old record!" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: write_message(" -Retrieve the old record to update: DONE", verbose=2) # flag to check whether the revisions have been verified and patch generated. 
# If revision verification failed, then we need to manually identify the affected tags # and process them revision_verified = False rev_verifier = RevisionVerifier() #check for revision conflicts before updating record if record_has_field(record, '005') and not CFG_BIBUPLOAD_DISABLE_RECORD_REVISIONS: write_message(" -Upload Record has 005. Verifying Revision", verbose=2) try: rev_res = rev_verifier.verify_revision(record, original_record, opt_mode) if rev_res: opt_mode = rev_res[0] record = rev_res[1] affected_tags = rev_res[2] revision_verified = True write_message(lambda: " -Patch record generated. Changing opt_mode to correct.\nPatch:\n%s " % record_xml_output(record), verbose=2) else: write_message(" -No Patch Record.", verbose=2) except InvenioBibUploadUnchangedRecordError, err: msg = " -ISSUE: %s" % err write_message(msg, verbose=1, stream=sys.stderr) write_message(msg, " Continuing anyway in case there are FFT or other tags") except InvenioBibUploadConflictingRevisionsError, err: msg = " -ERROR: Conflicting Revisions - %s" % err write_message(msg, verbose=1, stream=sys.stderr) submit_ticket_for_holding_pen(rec_id, err, "Conflicting Revisions. Inserting record into holding pen.", pretend=pretend) insert_record_into_holding_pen(record, str(rec_id), pretend=pretend) return (2, int(rec_id), msg) except InvenioBibUploadInvalidRevisionError, err: msg = " -ERROR: Invalid Revision - %s" % err write_message(msg) submit_ticket_for_holding_pen(rec_id, err, "Invalid Revisions. Inserting record into holding pen.", pretend=pretend) insert_record_into_holding_pen(record, str(rec_id), pretend=pretend) return (2, int(rec_id), msg) except InvenioBibUploadMissing005Error, err: msg = " -ERROR: Missing 005 - %s" % err write_message(msg) submit_ticket_for_holding_pen(rec_id, err, "Missing 005. Inserting record into holding pen.", pretend=pretend) insert_record_into_holding_pen(record, str(rec_id), pretend=pretend) return (2, int(rec_id), msg) else: write_message(" - No 005 Tag Present. 
Resuming normal flow.", verbose=2) # dictionaries to temporarily hold original recs tag-fields existing_tags = {} retained_tags = {} # in case of delete operation affected tags should be deleted in delete_bibrec_bibxxx # but should not be updated again in STAGE 4 # utilising the below flag is_opt_mode_delete = False if not revision_verified: # either 005 was not present or opt_mode was not correct/replace # in this case we still need to find out affected tags to process write_message(" - Missing 005 or opt_mode!=Replace/Correct.Revision Verifier not called.", verbose=2) # Identify affected tags if opt_mode == 'correct' or opt_mode == 'replace' or opt_mode == 'replace_or_insert': rec_diff = rev_verifier.compare_records(record, original_record, opt_mode) affected_tags = rev_verifier.retrieve_affected_tags_with_ind(rec_diff) elif opt_mode == 'delete': # populate an intermediate dictionary # used in upcoming step related to 'delete' mode is_opt_mode_delete = True for tag, fields in original_record.iteritems(): existing_tags[tag] = [tag + (field[1] != ' ' and field[1] or '_') + (field[2] != ' ' and field[2] or '_') for field in fields] elif opt_mode == 'append': for tag, fields in record.iteritems(): if tag not in CFG_BIBUPLOAD_CONTROLFIELD_TAGS: affected_tags[tag] = [(field[1], field[2]) for field in fields] # In Replace mode, take over old strong tags if applicable: if opt_mode == 'replace' or \ opt_mode == 'replace_or_insert': copy_strong_tags_from_old_record(record, rec_old) # Delete tags to correct in the record if opt_mode == 'correct': delete_tags_to_correct(record, rec_old) write_message(" -Delete the old tags to correct in the old record: DONE", verbose=2) # Delete tags specified if in delete mode if opt_mode == 'delete': record = delete_tags(record, rec_old) for tag, fields in record.iteritems(): retained_tags[tag] = [tag + (field[1] != ' ' and field[1] or '_') + (field[2] != ' ' and field[2] or '_') for field in fields] #identify the tags that have been deleted for tag in existing_tags.keys(): if tag not in retained_tags: for item in existing_tags[tag]: tag_to_add = item[0:3] ind1, ind2 = item[3], item[4] if tag_to_add in affected_tags and (ind1, ind2) not in affected_tags[tag_to_add]: affected_tags[tag_to_add].append((ind1, ind2)) else: affected_tags[tag_to_add] = [(ind1, ind2)] else: deleted = list(set(existing_tags[tag]) - set(retained_tags[tag])) for item in deleted: tag_to_add = item[0:3] ind1, ind2 = item[3], item[4] if tag_to_add in affected_tags and (ind1, ind2) not in affected_tags[tag_to_add]: affected_tags[tag_to_add].append((ind1, ind2)) else: affected_tags[tag_to_add] = [(ind1, ind2)] write_message(" -Delete specified tags in the old record: DONE", verbose=2) # Append new tag to the old record and update the new record with the old_record modified if opt_mode == 'append' or opt_mode == 'correct': record = append_new_tag_to_old_record(record, rec_old) write_message(" -Append new tags to the old record: DONE", verbose=2) write_message(" -Affected Tags found after comparing upload and original records: %s"%(str(affected_tags)), verbose=2) # 005 tag should be added everytime the record is modified # If an exiting record is modified, its 005 tag should be overwritten with a new revision value if record.has_key('005'): record_delete_field(record, '005') write_message(" Deleted the existing 005 tag.", verbose=2) last_revision = run_sql("SELECT MAX(job_date) FROM hstRECORD WHERE id_bibrec=%s", (rec_id, ))[0][0] if last_revision and last_revision.strftime("%Y%m%d%H%M%S.0") == 
record_modification_date.strftime("%Y%m%d%H%M%S.0"): ## We are updating the same record within the same seconds! It's less than ## the minimal granularity. Let's pause for 1 more second to take a breath :-) time.sleep(1) record_modification_date = datetime.now() error = record_add_field(record, '005', controlfield_value=record_modification_date.strftime("%Y%m%d%H%M%S.0")) if error is None: write_message(" Failed: Error during adding to 005 controlfield to record", verbose=1, stream=sys.stderr) return (1, int(rec_id)) else: error=None write_message(lambda: " -Added tag 005: DONE. " + str(record_get_field_value(record, '005', '', '')), verbose=2) # adding 005 to affected tags will delete the existing 005 entry # and update with the latest timestamp. if '005' not in affected_tags: affected_tags['005'] = [(' ', ' ')] write_message(" -Stage COMPLETED", verbose=2) record_deleted_p = False try: if not record_is_valid(record): msg = "ERROR: record is not valid" write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) # Have a look if we have FFT tags write_message("Stage 2: Start (Process FFT tags if exist).", verbose=2) record_had_FFT = False bibrecdocs = None if extract_tag_from_record(record, 'FFT') is not None: record_had_FFT = True if not writing_rights_p(): msg = "ERROR: no rights to write fulltext files" write_message(" Stage 2 failed: %s" % msg, verbose=1, stream=sys.stderr) raise StandardError(msg) try: bibrecdocs = BibRecDocs(rec_id) record = elaborate_fft_tags(record, rec_id, opt_mode, pretend=pretend, tmp_ids=tmp_ids, tmp_vers=tmp_vers, bibrecdocs=bibrecdocs) except Exception, e: register_exception() msg = " Stage 2 failed: ERROR: while elaborating FFT tags: %s" % e write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if record is None: msg = " Stage 2 failed: ERROR: while elaborating FFT tags" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Have a look if we have FFT tags write_message("Stage 2B: Start (Synchronize 8564 tags).", verbose=2) if record_had_FFT or extract_tag_from_record(record, '856') is not None: try: if bibrecdocs is None: bibrecdocs = BibRecDocs(rec_id) record = synchronize_8564(rec_id, record, record_had_FFT, bibrecdocs, pretend=pretend) # in case if FFT is in affected list make appropriate changes if not insert_mode_p: # because for insert, all tags are affected if ('4', ' ') not in affected_tags.get('856', []): if '856' not in affected_tags: affected_tags['856'] = [('4', ' ')] elif ('4', ' ') not in affected_tags['856']: affected_tags['856'].append(('4', ' ')) write_message(" -Modified field list updated with FFT details: %s" % str(affected_tags), verbose=2) except Exception, e: register_exception(alert_admin=True) msg = " Stage 2B failed: ERROR: while synchronizing 8564 tags: %s" % e write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if record is None: msg = " Stage 2B failed: ERROR: while synchronizing 8564 tags" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) write_message("Stage 3: Start (Apply fields deletion requests).", verbose=2) write_message(lambda: " Record before deletion:\n%s" % record_xml_output(record), verbose=9) # remove fields with __DELETE_FIELDS__ # NOTE:creating a temporary deep copy of record for iteration to avoid 
RunTimeError # RuntimeError due to change in dictionary size during iteration tmp_rec = copy.deepcopy(record) for tag in tmp_rec: for data_tuple in record[tag]: if (CFG_BIBUPLOAD_DELETE_CODE, CFG_BIBUPLOAD_DELETE_VALUE) in data_tuple[0]: # delete the tag with particular indicator pairs from original record record_delete_field(record, tag, data_tuple[1], data_tuple[2]) write_message(lambda: " Record after cleaning up fields to be deleted:\n%s" % record_xml_output(record), verbose=9) if opt_mode == 'append': write_message("Stage 3b: Drop duplicate fields in append mode.", verbose=2) record = record_drop_duplicate_fields(record) write_message(lambda: " Record after dropping duplicate fields:\n%s" % record_xml_output(record), verbose=9) # Update of the BibFmt write_message("Stage 4: Start (Update bibfmt).", verbose=2) updates_exist = not records_identical(record, original_record) if updates_exist: # if record_had_altered_bit, this must be set to true, since the # record has been altered. if record_had_altered_bit: oai_provenance_fields = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]) for oai_provenance_field in oai_provenance_fields: for i, (code, dummy_value) in enumerate(oai_provenance_field[0]): if code == CFG_OAI_PROVENANCE_ALTERED_SUBFIELD: oai_provenance_field[0][i] = (code, 'true') tmp_indicators = (CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3] == '_' and ' ' or CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4] == '_' and ' ' or CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]) if tmp_indicators not in affected_tags.get(CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3], []): if CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3] not in affected_tags: affected_tags[CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]] = [tmp_indicators] else: affected_tags[CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]].append(tmp_indicators) write_message(lambda: " Updates exists:\n%s\n!=\n%s" % (record, original_record), verbose=9) # format the single record as xml rec_xml_new = record_xml_output(record) # Update bibfmt with the format xm of this record modification_date = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(record_get_field_value(record, '005'), '%Y%m%d%H%M%S.0')) error = update_bibfmt_format(rec_id, rec_xml_new, 'xm', modification_date, pretend=pretend) if error == 1: msg = " Failed: ERROR: during update_bibfmt_format 'xm'" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE: error = update_bibfmt_format(rec_id, marshal.dumps(record), 'recstruct', modification_date, pretend=pretend) if error == 1: msg = " Failed: ERROR: during update_bibfmt_format 'recstruct'" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if not CFG_BIBUPLOAD_DISABLE_RECORD_REVISIONS: # archive MARCXML format of this record for version history purposes: if insert_mode_p: error = archive_marcxml_for_history(rec_id, affected_fields={}, pretend=pretend) else: error = archive_marcxml_for_history(rec_id, affected_fields=affected_tags, pretend=pretend) if error == 1: msg = " ERROR: Failed to archive MARCXML for history" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: write_message(" -Archived MARCXML for history: DONE", verbose=2) # delete some formats like HB upon record change: if updates_exist or record_had_FFT: for format_to_delete in CFG_BIBUPLOAD_DELETE_FORMATS: try: delete_bibfmt_format(rec_id, format_to_delete, pretend=pretend) except: # 
OK, some formats like HB could not have been deleted, no big deal pass write_message(" -Stage COMPLETED", verbose=2) ## Let's assert that one and only one 005 tag is existing at this stage. assert len(record['005']) == 1 # Update the database MetaData write_message("Stage 5: Start (Update the database with the metadata).", verbose=2) if insert_mode_p: update_database_with_metadata(record, rec_id, oai_rec_id, pretend=pretend) write_message(" -Stage COMPLETED", verbose=2) elif opt_mode in ('replace', 'replace_or_insert', 'append', 'correct', 'delete') and updates_exist: # now we clear all the rows from bibrec_bibxxx from the old record_deleted_p = True delete_bibrec_bibxxx(rec_old, rec_id, affected_tags, pretend=pretend) # metadata update will insert tags that are available in affected_tags. # but for delete, once the tags have been deleted from bibrec_bibxxx, they dont have to be inserted # except for 005. if is_opt_mode_delete: tmp_affected_tags = copy.deepcopy(affected_tags) for tag in tmp_affected_tags: if tag != '005': affected_tags.pop(tag) write_message(" -Clean bibrec_bibxxx: DONE", verbose=2) update_database_with_metadata(record, rec_id, oai_rec_id, affected_tags, pretend=pretend) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED in mode %s" % opt_mode, verbose=2) record_deleted_p = False # Finally we update the bibrec table with the current date write_message("Stage 6: Start (Update bibrec table with current date).", verbose=2) if opt_notimechange == 0 and (updates_exist or record_had_FFT): record_modification_date = convert_datestruct_to_datetext(time.localtime()) write_message(" -Retrieved current localtime: DONE", verbose=2) update_bibrec_date(record_modification_date, rec_id, insert_mode_p, record_creation_date, pretend=pretend) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Increase statistics if insert_mode_p: stat['nb_records_inserted'] += 1 else: stat['nb_records_updated'] += 1 # Upload of this record finish write_message("Record "+str(rec_id)+" DONE", verbose=1) return (0, int(rec_id), "") finally: if record_deleted_p: ## BibUpload has failed living the record deleted. We should ## back the original record then. update_database_with_metadata(original_record, rec_id, oai_rec_id, pretend=pretend) write_message(" Restored original record", verbose=1, stream=sys.stderr) def record_is_valid(record): """ Check if the record is valid. Currently this simply checks if the record has exactly one rec_id. @param record: the record @type record: recstruct @return: True if the record is valid @rtype: bool """ rec_ids = record_get_field_values(record, tag="001") if len(rec_ids) != 1: write_message(" The record is not valid: it has not a single rec_id: %s" % (rec_ids), stream=sys.stderr) return False return True def find_record_ids_by_oai_id(oaiId): """ A method finding the records identifier provided the oai identifier returns a list of identifiers matching a given oai identifier """ # Is this record already in invenio (matching by oaiid) if oaiId: recids = search_pattern(p=oaiId, f=CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, m='e') # Is this record already in invenio (matching by reportnumber i.e. # particularly 037. Idea: to avoid double insertions) repnumber = oaiId.split(":")[-1] if repnumber: recids |= search_pattern(p = repnumber, f = "reportnumber", m = 'e' ) # Is this record already in invenio (matching by reportnumber i.e. # particularly 037. 
Idea: to avoid double insertions) repnumber = "arXiv:" + oaiId.split(":")[-1] recids |= search_pattern(p = repnumber, f = "reportnumber", m = 'e' ) if CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: return recids else: if CFG_CERN_SITE: return recids - (search_pattern(p='DELETED', f='980__%', m='e') | search_pattern(p='DUMMY', f='980__%', m='e')) else: return recids - search_pattern(p='DELETED', f='980__%', m='e') else: return intbitset() def bibupload_post_phase(record, mode=None, rec_id="", pretend=False, tmp_ids=None, tmp_vers=None): def _elaborate_tag(record, tag, fun): if extract_tag_from_record(record, tag) is not None: try: record = fun() except Exception, e: register_exception() write_message(" Stage failed: ERROR: while elaborating %s tags: %s" % (tag, e), verbose=1, stream=sys.stderr) return (1, int(rec_id)) # TODO: ? if record is None: write_message(" Stage failed: ERROR: while elaborating %s tags" % (tag, ), verbose=1, stream=sys.stderr) return (1, int(rec_id)) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) if tmp_ids is None: tmp_ids = {} if tmp_vers is None: tmp_vers = {} _elaborate_tag(record, "BDR", lambda: elaborate_brt_tags(record, rec_id = rec_id, mode = mode, pretend = pretend, tmp_ids = tmp_ids, tmp_vers = tmp_vers)) _elaborate_tag(record, "BDM", lambda: elaborate_mit_tags(record, rec_id = rec_id, mode = mode, pretend = pretend, tmp_ids = tmp_ids, tmp_vers = tmp_vers)) def submit_ticket_for_holding_pen(rec_id, err, msg, pretend=False): """ Submit a ticket via BibCatalog to report about a record that has been put into the Holding Pen. @rec_id: the affected record @err: the corresponding Exception msg: verbose message """ from invenio import bibtask from invenio.webuser import get_email_from_username, get_uid_from_email user = task_get_task_param("user") uid = None if user: try: uid = get_uid_from_email(get_email_from_username(user)) except Exception, err: write_message("WARNING: can't reliably retrieve uid for user %s: %s" % (user, err), stream=sys.stderr) if check_bibcatalog(): text = """ %(msg)s found for record %(rec_id)s: %(err)s See: <%(siteurl)s/record/edit/#state=edit&recid=%(rec_id)s> BibUpload task information: task_id: %(task_id)s task_specific_name: %(task_specific_name)s user: %(user)s task_params: %(task_params)s task_options: %(task_options)s""" % { "msg": msg, "rec_id": rec_id, "err": err, "siteurl": CFG_SITE_SECURE_URL, "task_id": task_get_task_param("task_id"), "task_specific_name": task_get_task_param("task_specific_name"), "user": user, "task_params": bibtask._TASK_PARAMS, "task_options": bibtask._OPTIONS} if not pretend: BIBCATALOG_SYSTEM.ticket_submit(subject="%s: %s by %s" % (msg, rec_id, user), recordid=rec_id, text=text, queue=CFG_BIBUPLOAD_CONFLICTING_REVISION_TICKET_QUEUE, owner=uid) def insert_record_into_holding_pen(record, oai_id, pretend=False): query = "INSERT INTO bibHOLDINGPEN (oai_id, changeset_date, changeset_xml, id_bibrec) VALUES (%s, NOW(), %s, %s)" xml_record = record_xml_output(record) bibrec_ids = find_record_ids_by_oai_id(oai_id) # here determining the identifier of the record if len(bibrec_ids) > 0: bibrec_id = bibrec_ids.pop() else: # id not found by using the oai_id, let's use a wider search based # on any information we might have. bibrec_id = retrieve_rec_id(record, 'holdingpen', pretend=pretend) if bibrec_id is None: bibrec_id = 0 if not pretend: run_sql(query, (oai_id, compress(xml_record), bibrec_id)) # record_id is logged as 0! 
( We are not inserting into the main database) log_record_uploading(oai_id, task_get_task_param('task_id', 0), 0, 'H', pretend=pretend) stat['nb_holdingpen'] += 1 def print_out_bibupload_statistics(): """Print the statistics of the process""" out = "Task stats: %(nb_input)d input records, %(nb_updated)d updated, " \ "%(nb_inserted)d inserted, %(nb_errors)d errors, %(nb_holdingpen)d inserted to holding pen. " \ "Time %(nb_sec).2f sec." % { \ 'nb_input': stat['nb_records_to_upload'], 'nb_updated': stat['nb_records_updated'], 'nb_inserted': stat['nb_records_inserted'], 'nb_errors': stat['nb_errors'], 'nb_holdingpen': stat['nb_holdingpen'], 'nb_sec': time.time() - time.mktime(stat['exectime']) } write_message(out) def open_marc_file(path): """Open a file and return the data""" try: # open the file containing the marc document marc_file = open(path, 'r') marc = marc_file.read() marc_file.close() except IOError, erro: write_message("ERROR: %s" % erro, verbose=1, stream=sys.stderr) if erro.errno == 2: # No such file or directory # Not scary e = RecoverableError('File does not exist: %s' % path) else: e = StandardError('File not accessible: %s' % path) raise e return marc def xml_marc_to_records(xml_marc): """create the records""" # Creation of the records from the xml Marc in argument recs = create_records(xml_marc, 1, 1) if recs == []: msg = "ERROR: Cannot parse MARCXML file." write_message(msg, verbose=1, stream=sys.stderr) raise StandardError(msg) elif recs[0][0] is None: msg = "ERROR: MARCXML file has wrong format: %s" % recs write_message(msg, verbose=1, stream=sys.stderr) raise RecoverableError(msg) else: recs = map((lambda x:x[0]), recs) return recs def find_record_format(rec_id, bibformat): """Look whether record REC_ID is formatted in FORMAT, i.e. whether FORMAT exists in the bibfmt table for this record. Return the number of times it is formatted: 0 if not, 1 if yes, 2 if found more than once (should never occur). """ out = 0 query = """SELECT COUNT(*) FROM bibfmt WHERE id_bibrec=%s AND format=%s""" params = (rec_id, bibformat) res = [] res = run_sql(query, params) out = res[0][0] return out def find_record_from_recid(rec_id): """ Try to find record in the database from the REC_ID number. Return record ID if found, None otherwise. """ res = run_sql("SELECT id FROM bibrec WHERE id=%s", (rec_id,)) if res: return res[0][0] else: return None def find_record_from_sysno(sysno): """ Try to find record in the database from the external SYSNO number. Return record ID if found, None otherwise. """ bibxxx = 'bib'+CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:2]+'x' bibrec_bibxxx = 'bibrec_' + bibxxx res = run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, (CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, sysno,)) for recid in res: if CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: return recid[0] else: if record_exists(recid[0]) > 0: ## Only non deleted records return recid[0] return None def find_records_from_extoaiid(extoaiid, extoaisrc=None): """ Try to find records in the database from the external EXTOAIID number. Return list of record ID if found, None otherwise. 
""" assert(CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:5] == CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[:5]) bibxxx = 'bib'+CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:2]+'x' bibrec_bibxxx = 'bibrec_' + bibxxx write_message(' Looking for extoaiid="%s" with extoaisrc="%s"' % (extoaiid, extoaisrc), verbose=9) id_bibrecs = intbitset(run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, (CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, extoaiid,))) write_message(' Partially found %s for extoaiid="%s"' % (id_bibrecs, extoaiid), verbose=9) ret = intbitset() for id_bibrec in id_bibrecs: if not CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: if record_exists(id_bibrec) < 1: ## We don't match not existing records continue record = get_record(id_bibrec) instances = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]) write_message(' recid %s -> instances "%s"' % (id_bibrec, instances), verbose=9) for instance in instances: this_extoaisrc = field_get_subfield_values(instance, CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5]) this_extoaisrc = this_extoaisrc and this_extoaisrc[0] or None this_extoaiid = field_get_subfield_values(instance, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]) this_extoaiid = this_extoaiid and this_extoaiid[0] or None write_message(" this_extoaisrc -> %s, this_extoaiid -> %s" % (this_extoaisrc, this_extoaiid), verbose=9) if this_extoaiid == extoaiid: write_message(' recid %s -> provenance "%s"' % (id_bibrec, this_extoaisrc), verbose=9) if this_extoaisrc == extoaisrc: write_message('Found recid %s for extoaiid="%s" with provenance="%s"' % (id_bibrec, extoaiid, extoaisrc), verbose=9) ret.add(id_bibrec) break if this_extoaisrc is None: write_message('WARNING: Found recid %s for extoaiid="%s" that doesn\'t specify any provenance, while input record does.' % (id_bibrec, extoaiid), stream=sys.stderr) if extoaisrc is None: write_message('WARNING: Found recid %s for extoaiid="%s" that specify a provenance (%s), while input record does not have a provenance.' % (id_bibrec, extoaiid, this_extoaisrc), stream=sys.stderr) return ret def find_record_from_oaiid(oaiid): """ Try to find record in the database from the OAI ID number and OAI SRC. Return record ID if found, None otherwise. """ bibxxx = 'bib'+CFG_OAI_ID_FIELD[0:2]+'x' bibrec_bibxxx = 'bibrec_' + bibxxx res = run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, (CFG_OAI_ID_FIELD, oaiid,)) for recid in res: if CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: return recid[0] else: if record_exists(recid[0]) > 0: ## Only non deleted records return recid[0] return None def find_record_from_doi(doi): """ Try to find record in the database from the given DOI. Return record ID if found, None otherwise. 
""" bibxxx = 'bib02x' bibrec_bibxxx = 'bibrec_' + bibxxx res = run_sql("""SELECT bb.id_bibrec, bb.field_number FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, ('0247_a', doi,)) # For each of the result, make sure that it is really tagged as doi for (id_bibrec, field_number) in res: if not CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: if record_exists(id_bibrec) < 1: ## We don't match not existing records continue res = run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id and bb.field_number=%%s and bb.id_bibrec=%%s""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, ('0247_2', "doi", field_number, id_bibrec)) if res and res[0][0] == id_bibrec: return res[0][0] return None def extract_tag_from_record(record, tag_number): """ Extract the tag_number for record.""" # first step verify if the record is not already in the database if record: return record.get(tag_number, None) return None def retrieve_rec_id(record, opt_mode, pretend=False, post_phase = False): """Retrieve the record Id from a record by using tag 001 or SYSNO or OAI ID or DOI tag. opt_mod is the desired mode. @param post_phase Tells if we are calling this method in the postprocessing phase. If true, we accept presence of 001 fields even in the insert mode @type post_phase boolean """ rec_id = None # 1st step: we look for the tag 001 tag_001 = extract_tag_from_record(record, '001') if tag_001 is not None: # We extract the record ID from the tag rec_id = tag_001[0][3] # if we are in insert mode => error if opt_mode == 'insert' and not post_phase: write_message(" Failed: tag 001 found in the xml" \ " submitted, you should use the option replace," \ " correct or append to replace an existing" \ " record. (-h for help)", verbose=1, stream=sys.stderr) return -1 else: # we found the rec id and we are not in insert mode => continue # we try to match rec_id against the database: if find_record_from_recid(rec_id) is not None: # okay, 001 corresponds to some known record return int(rec_id) elif opt_mode in ('replace', 'replace_or_insert'): if task_get_option('force'): # we found the rec_id but it's not in the system and we are # requested to replace records. Therefore we create on the fly # a empty record allocating the recid. write_message(" WARNING: tag 001 found in the xml with" " value %(rec_id)s, but rec_id %(rec_id)s does" " not exist. Since the mode replace was" " requested the rec_id %(rec_id)s is allocated" " on-the-fly." % {"rec_id": rec_id}, stream=sys.stderr) return create_new_record(rec_id=rec_id, pretend=pretend) else: # Since --force was not used we are going to raise an error write_message(" Failed: tag 001 found in the xml" " submitted with value %(rec_id)s. The" " corresponding record however does not" " exists. If you want to really create" " such record, please use the --force" " parameter when calling bibupload." % { "rec_id": rec_id}, stream=sys.stderr) return -1 else: # The record doesn't exist yet. We shall have try to check # the SYSNO or OAI or DOI id later. 
write_message(" -Tag 001 value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag 001 not found in the xml marc file.", verbose=9) if rec_id is None: # 2nd step we look for the SYSNO sysnos = record_get_field_values(record, CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] or "", CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] or "", CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6]) if sysnos: sysno = sysnos[0] # there should be only one external SYSNO write_message(" -Checking if SYSNO " + sysno + \ " exists in the database", verbose=9) # try to find the corresponding rec id from the database rec_id = find_record_from_sysno(sysno) if rec_id is not None: # rec_id found pass else: # The record doesn't exist yet. We will try to check # external and internal OAI ids later. write_message(" -Tag SYSNO value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag SYSNO not found in the xml marc file.", verbose=9) if rec_id is None: # 2nd step we look for the external OAIID extoai_fields = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or "", CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or "") if extoai_fields: for field in extoai_fields: extoaiid = field_get_subfield_values(field, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6]) extoaisrc = field_get_subfield_values(field, CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6]) if extoaiid: extoaiid = extoaiid[0] if extoaisrc: extoaisrc = extoaisrc[0] else: extoaisrc = None write_message(" -Checking if EXTOAIID %s (%s) exists in the database" % (extoaiid, extoaisrc), verbose=9) # try to find the corresponding rec id from the database rec_ids = find_records_from_extoaiid(extoaiid, extoaisrc) if rec_ids: # rec_id found rec_id = rec_ids.pop() break else: # The record doesn't exist yet. We will try to check # OAI id later. write_message(" -Tag EXTOAIID value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag EXTOAIID not found in the xml marc file.", verbose=9) if rec_id is None: # 4th step we look for the OAI ID oaiidvalues = record_get_field_values(record, CFG_OAI_ID_FIELD[0:3], CFG_OAI_ID_FIELD[3:4] != "_" and \ CFG_OAI_ID_FIELD[3:4] or "", CFG_OAI_ID_FIELD[4:5] != "_" and \ CFG_OAI_ID_FIELD[4:5] or "", CFG_OAI_ID_FIELD[5:6]) if oaiidvalues: oaiid = oaiidvalues[0] # there should be only one OAI ID write_message(" -Check if local OAI ID " + oaiid + \ " exist in the database", verbose=9) # try to find the corresponding rec id from the database rec_id = find_record_from_oaiid(oaiid) if rec_id is not None: # rec_id found pass else: write_message(" -Tag OAI ID value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag SYSNO not found in the xml marc file.", verbose=9) if rec_id is None: # 5th step we look for the DOI. record_dois = record_extract_dois(record) matching_recids = set() if record_dois: # try to find the corresponding rec id from the database for record_doi in record_dois: possible_recid = find_record_from_doi(record_doi) if possible_recid: matching_recids.add(possible_recid) if len(matching_recids) > 1: # Oops, this record refers to DOI existing in multiple records. # Dunno which one to choose. 
write_message(" Failed: Multiple records found in the" \ " database %s that match the DOI(s) in the input" \ " MARCXML %s" % (repr(matching_recids), repr(record_dois)), verbose=1, stream=sys.stderr) return -1 elif len(matching_recids) == 1: rec_id = matching_recids.pop() if opt_mode == 'insert': write_message(" Failed: DOI tag matching record #%s found in the xml" \ " submitted, you should use the option replace," \ " correct or append to replace an existing" \ " record. (-h for help)" % rec_id, verbose=1, stream=sys.stderr) return -1 else: write_message(" - Tag DOI value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag DOI not found in the xml marc file.", verbose=9) # Now we should have detected rec_id from SYSNO or OAIID # tags. (None otherwise.) if rec_id: if opt_mode == 'insert': write_message(" Failed: Record found in the database," \ " you should use the option replace," \ " correct or append to replace an existing" \ " record. (-h for help)", verbose=1, stream=sys.stderr) return -1 else: if opt_mode != 'insert' and \ opt_mode != 'replace_or_insert': write_message(" Failed: Record not found in the database."\ " Please insert the file before updating it."\ " (-h for help)", verbose=1, stream=sys.stderr) return -1 return rec_id and int(rec_id) or None def check_record_doi_is_unique(rec_id, record): """ Check that DOI found in 'record' does not exist in any other record than 'recid'. Return (boolean, msg) where 'boolean' would be True if the DOI is unique. """ record_dois = record_extract_dois(record) if record_dois: matching_recids = set() for record_doi in record_dois: possible_recid = find_record_from_doi(record_doi) if possible_recid: matching_recids.add(possible_recid) if len(matching_recids) > 1: # Oops, this record refers to DOI existing in multiple records. msg = " Failed: Multiple records found in the" \ " database %s that match the DOI(s) in the input" \ " MARCXML %s" % (repr(matching_recids), repr(record_dois)) return (False, msg) elif len(matching_recids) == 1: matching_recid = matching_recids.pop() if str(matching_recid) != str(rec_id): # Oops, this record refers to DOI existing in a different record. msg = " Failed: DOI(s) %s found in this record (#%s)" \ " already exist(s) in another other record (#%s)" % \ (repr(record_dois), rec_id, matching_recid) return (False, msg) return (True, "") ### Insert functions def create_new_record(rec_id=None, pretend=False): """ Create new record in the database @param rec_id: if specified the new record will have this rec_id. @type rec_id: int @return: the allocated rec_id @rtype: int @note: in case of errors will be returned None """ if rec_id is not None: try: rec_id = int(rec_id) except (ValueError, TypeError), error: write_message(" ERROR: during the creation_new_record function: %s " % error, verbose=1, stream=sys.stderr) return None if run_sql("SELECT id FROM bibrec WHERE id=%s", (rec_id, )): write_message(" ERROR: during the creation_new_record function: the requested rec_id %s already exists." 
% rec_id) return None if pretend: if rec_id: return rec_id else: return run_sql("SELECT max(id)+1 FROM bibrec")[0][0] if rec_id is not None: return run_sql("INSERT INTO bibrec (id, creation_date, modification_date) VALUES (%s, NOW(), NOW())", (rec_id, )) else: return run_sql("INSERT INTO bibrec (creation_date, modification_date) VALUES (NOW(), NOW())") def insert_bibfmt(id_bibrec, marc, bibformat, modification_date='1970-01-01 00:00:00', pretend=False): """Insert the format in the table bibfmt""" # compress the marc value pickled_marc = compress(marc) try: time.strptime(modification_date, "%Y-%m-%d %H:%M:%S") except ValueError: modification_date = '1970-01-01 00:00:00' query = """INSERT LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) - VALUES (%s, %s, %s, %s)""" + VALUES (%s, %s, %s, _binary %s)""" if not pretend: row_id = run_sql(query, (id_bibrec, bibformat, modification_date, pickled_marc)) return row_id else: return 1 def insert_record_bibxxx(tag, value, pretend=False): """Insert the record into bibxxx""" # determine into which table one should insert the record table_name = 'bib'+tag[0:2]+'x' # check if the tag, value combination exists in the table query = """SELECT id,value FROM %s """ % table_name query += """ WHERE tag=%s AND value=%s""" params = (tag, value) res = None res = run_sql(query, params) # Note: compare now the found values one by one and look for # string binary equality (e.g. to respect lowercase/uppercase # match), regardless of the charset etc settings. Ideally we # could use a BINARY operator in the above SELECT statement, but # we would have to check compatibility on various MySQLdb versions # etc; this approach checks all matched values in Python, not in # MySQL, which is less cool, but more conservative, so it should # work better on most setups. if res: for row in res: row_id = row[0] row_value = row[1] if row_value == value: return (table_name, row_id) # We got here only when the tag, value combination was not found, # so it is now necessary to insert the tag, value combination into # bibxxx table as new. query = """INSERT INTO %s """ % table_name query += """ (tag, value) values (%s , %s)""" params = (tag, value) if not pretend: row_id = run_sql(query, params) else: return (table_name, 1) return (table_name, row_id) def insert_record_bibrec_bibxxx(table_name, id_bibxxx, field_number, id_bibrec, pretend=False): """Insert the record into bibrec_bibxxx""" # determine into which table one should insert the record full_table_name = 'bibrec_'+ table_name # insert the proper row into the table query = """INSERT INTO %s """ % full_table_name query += """(id_bibrec,id_bibxxx, field_number) values (%s , %s, %s)""" params = (id_bibrec, id_bibxxx, field_number) if not pretend: res = run_sql(query, params) else: return 1 return res def synchronize_8564(rec_id, record, record_had_FFT, bibrecdocs, pretend=False): """ Synchronize 8564_ tags and BibDocFile tables. This function directly manipulate the record parameter. @type rec_id: positive integer @param rec_id: the record identifier. @param record: the record structure as created by bibrecord.create_record @type record_had_FFT: boolean @param record_had_FFT: True if the incoming bibuploaded-record used FFT @return: the manipulated record (which is also modified as a side effect) """ def merge_marc_into_bibdocfile(field, pretend=False): """ Internal function that reads a single field and stores its content in BibDocFile tables. @param field: the 8564_ field containing a BibDocFile URL. 
""" write_message('Merging field: %s' % (field, ), verbose=9) url = field_get_subfield_values(field, 'u')[:1] or field_get_subfield_values(field, 'q')[:1] description = field_get_subfield_values(field, 'y')[:1] comment = field_get_subfield_values(field, 'z')[:1] if url: recid, docname, docformat = decompose_bibdocfile_url(url[0]) if recid != rec_id: write_message("INFO: URL %s is not pointing to a fulltext owned by this record (%s)" % (url, recid), stream=sys.stderr) else: try: bibdoc = bibrecdocs.get_bibdoc(docname) if description and not pretend: bibdoc.set_description(description[0], docformat) if comment and not pretend: bibdoc.set_comment(comment[0], docformat) except InvenioBibDocFileError: ## Apparently the referenced docname doesn't exist anymore. ## Too bad. Let's skip it. write_message("WARNING: docname %s does not seem to exist for record %s. Has it been renamed outside FFT?" % (docname, recid), stream=sys.stderr) def merge_bibdocfile_into_marc(field, subfields): """ Internal function that reads BibDocFile table entries referenced by the URL in the given 8564_ field and integrate the given information directly with the provided subfields. @param field: the 8564_ field containing a BibDocFile URL. @param subfields: the subfields corresponding to the BibDocFile URL generated after BibDocFile tables. """ write_message('Merging subfields %s into field %s' % (subfields, field), verbose=9) subfields = dict(subfields) ## We make a copy not to have side-effects subfield_to_delete = [] for subfield_position, (code, value) in enumerate(field_get_subfield_instances(field)): ## For each subfield instance already existing... if code in subfields: ## ...We substitute it with what is in BibDocFile tables record_modify_subfield(record, '856', code, subfields[code], subfield_position, field_position_global=field[4]) del subfields[code] else: ## ...We delete it otherwise subfield_to_delete.append(subfield_position) subfield_to_delete.sort() for counter, position in enumerate(subfield_to_delete): ## FIXME: Very hackish algorithm. Since deleting a subfield ## will alterate the position of following subfields, we ## are taking note of this and adjusting further position ## by using a counter. record_delete_subfield_from(record, '856', position - counter, field_position_global=field[4]) subfields = subfields.items() subfields.sort() for code, value in subfields: ## Let's add non-previously existing subfields record_add_subfield_into(record, '856', code, value, field_position_global=field[4]) def get_bibdocfile_managed_info(): """ Internal function, returns a dictionary of BibDocFile URL -> wanna-be subfields. 
This information is retrieved from internal BibDoc structures rather than from input MARC XML files @rtype: mapping @return: BibDocFile URL -> wanna-be subfields dictionary """ ret = {} latest_files = bibrecdocs.list_latest_files(list_hidden=False) for afile in latest_files: url = afile.get_url() ret[url] = {'u': url} description = afile.get_description() comment = afile.get_comment() subformat = afile.get_subformat() size = afile.get_size() if description: ret[url]['y'] = description if comment: ret[url]['z'] = comment if subformat: ret[url]['x'] = subformat ret[url]['s'] = str(size) return ret write_message("Synchronizing MARC of recid '%s' with:\n%s" % (rec_id, record), verbose=9) tags856s = record_get_field_instances(record, '856', '%', '%') write_message("Original 856%% instances: %s" % tags856s, verbose=9) tags8564s_to_add = get_bibdocfile_managed_info() write_message("BibDocFile instances: %s" % tags8564s_to_add, verbose=9) positions_tags8564s_to_remove = [] for local_position, field in enumerate(tags856s): if field[1] == '4' and field[2] == ' ': write_message('Analysing %s' % (field, ), verbose=9) for url in field_get_subfield_values(field, 'u') + field_get_subfield_values(field, 'q'): if url in tags8564s_to_add: # there exists a link in the MARC of the record and the connection exists in BibDoc tables if record_had_FFT: merge_bibdocfile_into_marc(field, tags8564s_to_add[url]) else: merge_marc_into_bibdocfile(field, pretend=pretend) del tags8564s_to_add[url] break elif bibdocfile_url_p(url) and decompose_bibdocfile_url(url)[0] == rec_id: # The link exists and is potentially correct-looking link to a document # moreover, it refers to current record id ... but it does not exist in # internal BibDoc structures. This could have happen in the case of renaming a document # or its removal. In both cases we have to remove link... a new one will be created positions_tags8564s_to_remove.append(local_position) write_message("%s to be deleted and re-synchronized" % (field, ), verbose=9) break record_delete_fields(record, '856', positions_tags8564s_to_remove) tags8564s_to_add = tags8564s_to_add.values() tags8564s_to_add.sort() ## FIXME: we are not yet able to preserve the sorting ## of 8564 tags WRT FFT in BibUpload. ## See ticket #1606. 
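    # Whatever is left in tags8564s_to_add at this point are documents known to
    # BibDocFile but not referenced by any 8564_ URL in the incoming MARC: a fresh
    # 8564_ field is appended for each of them, built from the BibDoc-side subfields.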
for subfields in tags8564s_to_add: subfields = subfields.items() subfields.sort() record_add_field(record, '856', '4', ' ', subfields=subfields) write_message('Final record: %s' % record, verbose=9) return record def _get_subfield_value(field, subfield_code, default=None): res = field_get_subfield_values(field, subfield_code) if res != [] and res != None: return res[0] else: return default def elaborate_mit_tags(record, rec_id, mode, pretend = False, tmp_ids = {}, tmp_vers = {}): """ Uploading MoreInfo -> BDM tags """ tuple_list = extract_tag_from_record(record, 'BDM') # Now gathering information from BDR tags - to be processed later write_message("Processing BDM entries of the record ") recordDocs = BibRecDocs(rec_id) if tuple_list: for mit in record_get_field_instances(record, 'BDM', ' ', ' '): relation_id = _get_subfield_value(mit, "r") bibdoc_id = _get_subfield_value(mit, "i") # checking for a possibly temporary ID if not (bibdoc_id is None): bibdoc_id = resolve_identifier(tmp_ids, bibdoc_id) bibdoc_ver = _get_subfield_value(mit, "v") if not (bibdoc_ver is None): bibdoc_ver = resolve_identifier(tmp_vers, bibdoc_ver) bibdoc_name = _get_subfield_value(mit, "n") bibdoc_fmt = _get_subfield_value(mit, "f") moreinfo_str = _get_subfield_value(mit, "m") if bibdoc_id == None: if bibdoc_name == None: raise StandardError("Incorrect relation. Neither name nor identifier of the first obejct has been specified") else: # retrieving the ID based on the document name (inside current record) # The document is attached to current record. try: bibdoc_id = recordDocs.get_docid(bibdoc_name) except: raise StandardError("BibDoc of a name %s does not exist within a record" % (bibdoc_name, )) else: if bibdoc_name != None: write_message("WARNING: both name and id of the first document of a relation have been specified. Ignoring the name", stream=sys.stderr) if (moreinfo_str is None or mode in ("replace", "correct")) and (not pretend): MoreInfo(docid=bibdoc_id , version = bibdoc_ver, docformat = bibdoc_fmt, relation = relation_id).delete() if (not moreinfo_str is None) and (not pretend): MoreInfo.create_from_serialised(moreinfo_str, docid=bibdoc_id, version = bibdoc_ver, docformat = bibdoc_fmt, relation = relation_id) return record def elaborate_brt_tags(record, rec_id, mode, pretend=False, tmp_ids = {}, tmp_vers = {}): """ Process BDR tags describing relations between existing objects """ tuple_list = extract_tag_from_record(record, 'BDR') # Now gathering information from BDR tags - to be processed later relations_to_create = [] write_message("Processing BDR entries of the record ") recordDocs = BibRecDocs(rec_id) #TODO: check what happens if there is no record yet ! Will the class represent an empty set? if tuple_list: for brt in record_get_field_instances(record, 'BDR', ' ', ' '): relation_id = _get_subfield_value(brt, "r") bibdoc1_id = None bibdoc1_name = None bibdoc1_ver = None bibdoc1_fmt = None bibdoc2_id = None bibdoc2_name = None bibdoc2_ver = None bibdoc2_fmt = None if not relation_id: bibdoc1_id = _get_subfield_value(brt, "i") bibdoc1_name = _get_subfield_value(brt, "n") if bibdoc1_id == None: if bibdoc1_name == None: raise StandardError("Incorrect relation. Neither name nor identifier of the first obejct has been specified") else: # retrieving the ID based on the document name (inside current record) # The document is attached to current record. 
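                    # The same name/id resolution is applied to both ends of the
                    # relation: $i/$n identify the first document and $j/$o the
                    # second one; a document name is only looked up (via BibRecDocs)
                    # when no explicit identifier was given.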
try: bibdoc1_id = recordDocs.get_docid(bibdoc1_name) except: raise StandardError("BibDoc of a name %s does not exist within a record" % \ (bibdoc1_name, )) else: # resolving temporary identifier bibdoc1_id = resolve_identifier(tmp_ids, bibdoc1_id) if bibdoc1_name != None: write_message("WARNING: both name and id of the first document of a relation have been specified. Ignoring the name", stream=sys.stderr) bibdoc1_ver = _get_subfield_value(brt, "v") if not (bibdoc1_ver is None): bibdoc1_ver = resolve_identifier(tmp_vers, bibdoc1_ver) bibdoc1_fmt = _get_subfield_value(brt, "f") bibdoc2_id = _get_subfield_value(brt, "j") bibdoc2_name = _get_subfield_value(brt, "o") if bibdoc2_id == None: if bibdoc2_name == None: raise StandardError("Incorrect relation. Neither name nor identifier of the second obejct has been specified") else: # retrieving the ID based on the document name (inside current record) # The document is attached to current record. try: bibdoc2_id = recordDocs.get_docid(bibdoc2_name) except: raise StandardError("BibDoc of a name %s does not exist within a record" % (bibdoc2_name, )) else: bibdoc2_id = resolve_identifier(tmp_ids, bibdoc2_id) if bibdoc2_name != None: write_message("WARNING: both name and id of the first document of a relation have been specified. Ignoring the name", stream=sys.stderr) bibdoc2_ver = _get_subfield_value(brt, "w") if not (bibdoc2_ver is None): bibdoc2_ver = resolve_identifier(tmp_vers, bibdoc2_ver) bibdoc2_fmt = _get_subfield_value(brt, "g") control_command = _get_subfield_value(brt, "d") relation_type = _get_subfield_value(brt, "t") if not relation_type and not relation_id: raise StandardError("The relation type must be specified") more_info = _get_subfield_value(brt, "m") # the relation id might be specified in the case of updating # MoreInfo table instead of other fields rel_obj = None if not relation_id: rels = BibRelation.get_relations(rel_type = relation_type, bibdoc1_id = bibdoc1_id, bibdoc2_id = bibdoc2_id, bibdoc1_ver = bibdoc1_ver, bibdoc2_ver = bibdoc2_ver, bibdoc1_fmt = bibdoc1_fmt, bibdoc2_fmt = bibdoc2_fmt) if len(rels) > 0: rel_obj = rels[0] relation_id = rel_obj.id else: rel_obj = BibRelation(rel_id=relation_id) relations_to_create.append((relation_id, bibdoc1_id, bibdoc1_ver, bibdoc1_fmt, bibdoc2_id, bibdoc2_ver, bibdoc2_fmt, relation_type, more_info, rel_obj, control_command)) record_delete_field(record, 'BDR', ' ', ' ') if mode in ("insert", "replace_or_insert", "append", "correct", "replace"): # now creating relations between objects based on the data if not pretend: for (relation_id, bibdoc1_id, bibdoc1_ver, bibdoc1_fmt, bibdoc2_id, bibdoc2_ver, bibdoc2_fmt, rel_type, more_info, rel_obj, control_command) in relations_to_create: if rel_obj == None: rel_obj = BibRelation.create(bibdoc1_id = bibdoc1_id, bibdoc1_ver = bibdoc1_ver, bibdoc1_fmt = bibdoc1_fmt, bibdoc2_id = bibdoc2_id, bibdoc2_ver = bibdoc2_ver, bibdoc2_fmt = bibdoc2_fmt, rel_type = rel_type) relation_id = rel_obj.id if mode in ("replace"): # Clearing existing MoreInfo content rel_obj.get_more_info().delete() if more_info: MoreInfo.create_from_serialised(more_info, relation = relation_id) if control_command == "DELETE": rel_obj.delete() else: write_message("BDR tag is not processed in the %s mode" % (mode, )) return record def elaborate_fft_tags(record, rec_id, mode, pretend=False, tmp_ids = {}, tmp_vers = {}, bibrecdocs=None): """ Process FFT tags that should contain $a with file pathes or URLs to get the fulltext from. 
This function enriches record with proper 8564 URL tags, downloads fulltext files and stores them into var/data structure where appropriate. CFG_BIBUPLOAD_WGET_SLEEP_TIME defines time to sleep in seconds in between URL downloads. Note: if an FFT tag contains multiple $a subfields, we upload them into different 856 URL tags in the metadata. See regression test case test_multiple_fft_insert_via_http(). """ # Let's define some handy sub procedure. def _add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, modification_date, pretend=False): """Adds a new format for a given bibdoc. Returns True when everything's fine.""" write_message('Add new format to %s url: %s, format: %s, docname: %s, doctype: %s, newname: %s, description: %s, comment: %s, flags: %s, modification_date: %s' % (repr(bibdoc), url, docformat, docname, doctype, newname, description, comment, flags, modification_date), verbose=9) try: if not url: # Not requesting a new url. Just updating comment & description return _update_description_and_comment(bibdoc, docname, docformat, description, comment, flags, pretend=pretend) try: if not pretend: bibdoc.add_file_new_format(url, description=description, comment=comment, flags=flags, modification_date=modification_date) except StandardError, e: write_message("('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') not inserted because format already exists (%s)." % (url, docformat, docname, doctype, newname, description, comment, flags, modification_date, e), stream=sys.stderr) raise except Exception, e: write_message("ERROR: in adding '%s' as a new format because of: %s" % (url, e), stream=sys.stderr) raise return True def _add_new_version(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, modification_date, pretend=False): """Adds a new version for a given bibdoc. Returns True when everything's fine.""" write_message('Add new version to %s url: %s, format: %s, docname: %s, doctype: %s, newname: %s, description: %s, comment: %s, flags: %s' % (repr(bibdoc), url, docformat, docname, doctype, newname, description, comment, flags), verbose=9) try: if not url: return _update_description_and_comment(bibdoc, docname, docformat, description, comment, flags, pretend=pretend) try: if not pretend: bibdoc.add_file_new_version(url, description=description, comment=comment, flags=flags, modification_date=modification_date) except StandardError, e: write_message("('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') not inserted because '%s'." % (url, docformat, docname, doctype, newname, description, comment, flags, modification_date, e), stream=sys.stderr) raise except Exception, e: write_message("ERROR: in adding '%s' as a new version because of: %s" % (url, e), stream=sys.stderr) raise return True def _update_description_and_comment(bibdoc, docname, docformat, description, comment, flags, pretend=False): """Directly update comments and descriptions.""" write_message('Just updating description and comment for %s with format %s with description %s, comment %s and flags %s' % (docname, docformat, description, comment, flags), verbose=9) try: if not pretend: bibdoc.set_description(description, docformat) bibdoc.set_comment(comment, docformat) for flag in CFG_BIBDOCFILE_AVAILABLE_FLAGS: if flag in flags: bibdoc.set_flag(flag, docformat) else: bibdoc.unset_flag(flag, docformat) except StandardError, e: write_message("('%s', '%s', '%s', '%s', '%s') description and comment not updated because '%s'." 
% (docname, docformat, description, comment, flags, e)) raise return True def _process_document_moreinfos(more_infos, docname, version, docformat, mode): if not mode in ('correct', 'append', 'replace_or_insert', 'replace', 'insert'): #print "exited because the mode is incorrect" return docid = None try: docid = bibrecdocs.get_docid(docname) except: raise StandardError("MoreInfo: No document of a given name associated with the record") if not version: # We have to retrieve the most recent version ... version = bibrecdocs.get_bibdoc(docname).get_latest_version() doc_moreinfo_s, version_moreinfo_s, version_format_moreinfo_s, format_moreinfo_s = more_infos if mode in ("replace", "replace_or_insert"): if doc_moreinfo_s: #only if specified, otherwise do not touch MoreInfo(docid = docid).delete() if format_moreinfo_s: #only if specified... otherwise do not touch MoreInfo(docid = docid, docformat = docformat).delete() if not doc_moreinfo_s is None: MoreInfo.create_from_serialised(ser_str = doc_moreinfo_s, docid = docid) if not version_moreinfo_s is None: MoreInfo.create_from_serialised(ser_str = version_moreinfo_s, docid = docid, version = version) if not version_format_moreinfo_s is None: MoreInfo.create_from_serialised(ser_str = version_format_moreinfo_s, docid = docid, version = version, docformat = docformat) if not format_moreinfo_s is None: MoreInfo.create_from_serialised(ser_str = format_moreinfo_s, docid = docid, docformat = docformat) if mode == 'delete': raise StandardError('FFT tag specified but bibupload executed in --delete mode') tuple_list = extract_tag_from_record(record, 'FFT') if tuple_list: # FFT Tags analysis write_message("FFTs: "+str(tuple_list), verbose=9) docs = {} # docnames and their data for fft in record_get_field_instances(record, 'FFT', ' ', ' '): # Very first, we retrieve the potentially temporary odentifiers... #even if the rest fails, we should include them in teh dictionary version = _get_subfield_value(fft, 'v', '') # checking if version is temporary... if so, filling a different varaible is_tmp_ver, bibdoc_tmpver = parse_identifier(version) if is_tmp_ver: version = None else: bibdoc_tmpver = None if not version: #treating cases of empty string etc... version = None bibdoc_tmpid = field_get_subfield_values(fft, 'i') if bibdoc_tmpid: bibdoc_tmpid = bibdoc_tmpid[0] else: bibdoc_tmpid is_tmp_id, bibdoc_tmpid = parse_identifier(bibdoc_tmpid) if not is_tmp_id: bibdoc_tmpid = None # In the case of having temporary id's, we dont resolve them yet but signaklise that they have been used # value -1 means that identifier has been declared but not assigned a value yet if bibdoc_tmpid: if bibdoc_tmpid in tmp_ids: write_message("WARNING: the temporary identifier %s has been declared more than once. Ignoring the second occurance" % (bibdoc_tmpid, ), stream=sys.stderr) else: tmp_ids[bibdoc_tmpid] = -1 if bibdoc_tmpver: if bibdoc_tmpver in tmp_vers: write_message("WARNING: the temporary version identifier %s has been declared more than once. Ignoring the second occurance" % (bibdoc_tmpver, ), stream=sys.stderr) else: tmp_vers[bibdoc_tmpver] = -1 # Let's discover the type of the document # This is a legacy field and will not be enforced any particular # check on it. doctype = _get_subfield_value(fft, 't', 'Main') #Default is Main # Let's discover the url. 
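# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the bookkeeping used
# above for temporary identifiers carried in FFT $i / $v.  parse_identifier()
# is defined elsewhere in this module; the "TMP:" prefix below is only an
# assumption made for the example.  What matters is the convention that a
# declared but not yet resolved identifier is stored in tmp_ids with value -1.
def _sketch_parse_identifier(value, prefix='TMP:'):
    """Return (is_temporary, identifier) for a subfield value (hypothetical)."""
    if isinstance(value, str) and value.startswith(prefix):
        return True, value[len(prefix):]
    return False, value

_sketch_tmp_ids = {}
_is_tmp, _ident = _sketch_parse_identifier('TMP:doc_a')
if _is_tmp and _ident not in _sketch_tmp_ids:
    _sketch_tmp_ids[_ident] = -1   # declared; the real docid is assigned later
assert _sketch_tmp_ids == {'doc_a': -1}
# ---------------------------------------------------------------------------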
url = field_get_subfield_values(fft, 'a') if url: url = url[0] try: check_valid_url(url) except StandardError, e: raise StandardError, "fft '%s' specifies in $a a location ('%s') with problems: %s" % (fft, url, e) else: url = '' #TODO: a lot of code can be compactified using similar syntax ... should be more readable on the longer scale # maybe right side expressions look a bit cryptic, but the elaborate_fft function would be much clearer if mode == 'correct' and doctype != 'FIX-MARC': arg2 = "" else: arg2 = KEEP_OLD_VALUE description = _get_subfield_value(fft, 'd', arg2) # Let's discover the description # description = field_get_subfield_values(fft, 'd') # if description != []: # description = description[0] # else: # if mode == 'correct' and doctype != 'FIX-MARC': ## If the user require to correct, and do not specify ## a description this means she really want to ## modify the description. # description = '' # else: # description = KEEP_OLD_VALUE # Let's discover the desired docname to be created/altered name = field_get_subfield_values(fft, 'n') if name: ## Let's remove undesired extensions name = file_strip_ext(name[0] + '.pdf') else: if url: name = get_docname_from_url(url) elif mode != 'correct' and doctype != 'FIX-MARC': raise StandardError, "WARNING: fft '%s' doesn't specifies either a location in $a or a docname in $n" % str(fft) else: continue # Let's discover the desired new docname in case we want to change it newname = field_get_subfield_values(fft, 'm') if newname: newname = file_strip_ext(newname[0] + '.pdf') else: newname = name # Let's discover the desired format docformat = field_get_subfield_values(fft, 'f') if docformat: docformat = normalize_format(docformat[0]) else: if url: docformat = guess_format_from_url(url) else: docformat = "" # Let's discover the icon icon = field_get_subfield_values(fft, 'x') if icon != []: icon = icon[0] if icon != KEEP_OLD_VALUE: try: check_valid_url(icon) except StandardError, e: raise StandardError, "fft '%s' specifies in $x an icon ('%s') with problems: %s" % (fft, icon, e) else: icon = '' # Let's discover the comment comment = field_get_subfield_values(fft, 'z') if comment != []: comment = comment[0] else: if mode == 'correct' and doctype != 'FIX-MARC': ## See comment on description comment = '' else: comment = KEEP_OLD_VALUE # Let's discover the restriction restriction = field_get_subfield_values(fft, 'r') if restriction != []: restriction = restriction[0] else: if mode == 'correct' and doctype != 'FIX-MARC': ## See comment on description restriction = '' else: restriction = KEEP_OLD_VALUE document_moreinfo = _get_subfield_value(fft, 'w') version_moreinfo = _get_subfield_value(fft, 'p') version_format_moreinfo = _get_subfield_value(fft, 'b') format_moreinfo = _get_subfield_value(fft, 'u') # Let's discover the timestamp of the file (if any) timestamp = field_get_subfield_values(fft, 's') if timestamp: try: timestamp = datetime(*(time.strptime(timestamp[0], "%Y-%m-%d %H:%M:%S")[:6])) except ValueError: write_message('WARNING: The timestamp is not in a good format, thus will be ignored. 
The format should be YYYY-MM-DD HH:MM:SS', stream=sys.stderr) timestamp = '' else: timestamp = '' flags = field_get_subfield_values(fft, 'o') for flag in flags: if flag not in CFG_BIBDOCFILE_AVAILABLE_FLAGS: raise StandardError, "fft '%s' specifies a non available flag: %s" % (fft, flag) if docs.has_key(name): # new format considered (doctype2, newname2, restriction2, version2, urls, dummybibdoc_moreinfos2, dummybibdoc_tmpid2, dummybibdoc_tmpver2 ) = docs[name] if doctype2 != doctype: raise StandardError, "fft '%s' specifies a different doctype from previous fft with docname '%s'" % (str(fft), name) if newname2 != newname: raise StandardError, "fft '%s' specifies a different newname from previous fft with docname '%s'" % (str(fft), name) if restriction2 != restriction: raise StandardError, "fft '%s' specifies a different restriction from previous fft with docname '%s'" % (str(fft), name) if version2 != version: raise StandardError, "fft '%s' specifies a different version than the previous fft with docname '%s'" % (str(fft), name) for (dummyurl2, format2, dummydescription2, dummycomment2, dummyflags2, dummytimestamp2) in urls: if docformat == format2: raise StandardError, "fft '%s' specifies a second file '%s' with the same format '%s' from previous fft with docname '%s'" % (str(fft), url, docformat, name) if url or docformat: urls.append((url, docformat, description, comment, flags, timestamp)) if icon: urls.append((icon, icon[len(file_strip_ext(icon)):] + ';icon', description, comment, flags, timestamp)) else: if url or docformat: docs[name] = (doctype, newname, restriction, version, [(url, docformat, description, comment, flags, timestamp)], [document_moreinfo, version_moreinfo, version_format_moreinfo, format_moreinfo], bibdoc_tmpid, bibdoc_tmpver) if icon: docs[name][4].append((icon, icon[len(file_strip_ext(icon)):] + ';icon', description, comment, flags, timestamp)) elif icon: docs[name] = (doctype, newname, restriction, version, [(icon, icon[len(file_strip_ext(icon)):] + ';icon', description, comment, flags, timestamp)], [document_moreinfo, version_moreinfo, version_format_moreinfo, format_moreinfo], bibdoc_tmpid, bibdoc_tmpver) else: docs[name] = (doctype, newname, restriction, version, [], [document_moreinfo, version_moreinfo, version_format_moreinfo, format_moreinfo], bibdoc_tmpid, bibdoc_tmpver) write_message('Result of FFT analysis:\n\tDocs: %s' % (docs,), verbose=9) # Let's remove all FFT tags record_delete_field(record, 'FFT', ' ', ' ') ## Let's pre-download all the URLs to see if, in case of mode 'correct' or 'append' ## we can avoid creating a new revision. for docname, (doctype, newname, restriction, version, urls, more_infos, bibdoc_tmpid, bibdoc_tmpver ) in docs.items(): downloaded_urls = [] try: bibdoc = bibrecdocs.get_bibdoc(docname) except InvenioBibDocFileError: ## A bibdoc with the given docname does not exists. 
## So there is no chance we are going to revise an existing ## format with an identical file :-) bibdoc = None new_revision_needed = False for url, docformat, description, comment, flags, timestamp in urls: if url: try: downloaded_url = download_url(url, docformat) write_message("%s saved into %s" % (url, downloaded_url), verbose=9) except Exception, err: write_message("ERROR: in downloading '%s' because of: %s" % (url, err), stream=sys.stderr) raise if mode == 'correct' and bibdoc is not None and not new_revision_needed: downloaded_urls.append((downloaded_url, docformat, description, comment, flags, timestamp)) if not bibrecdocs.check_file_exists(downloaded_url, docformat): new_revision_needed = True else: write_message("WARNING: %s is already attached to bibdoc %s for recid %s" % (url, docname, rec_id), stream=sys.stderr) elif mode == 'append' and bibdoc is not None: if not bibrecdocs.check_file_exists(downloaded_url, docformat): downloaded_urls.append((downloaded_url, docformat, description, comment, flags, timestamp)) else: write_message("WARNING: %s is already attached to bibdoc %s for recid %s" % (url, docname, rec_id), stream=sys.stderr) else: downloaded_urls.append((downloaded_url, docformat, description, comment, flags, timestamp)) else: downloaded_urls.append(('', docformat, description, comment, flags, timestamp)) if mode == 'correct' and bibdoc is not None and not new_revision_needed: ## Since we don't need a new revision (because all the files ## that are being uploaded are different) ## we can simply remove the urls but keep the other information write_message("No need to add a new revision for docname %s for recid %s" % (docname, rec_id), verbose=2) docs[docname] = (doctype, newname, restriction, version, [('', docformat, description, comment, flags, timestamp) for (dummy, docformat, description, comment, flags, timestamp) in downloaded_urls], more_infos, bibdoc_tmpid, bibdoc_tmpver) for downloaded_url, dummy, dummy, dummy, dummy, dummy in downloaded_urls: ## Let's free up some space :-) if downloaded_url and os.path.exists(downloaded_url): os.remove(downloaded_url) else: if downloaded_urls or mode != 'append': docs[docname] = (doctype, newname, restriction, version, downloaded_urls, more_infos, bibdoc_tmpid, bibdoc_tmpver) else: ## In case we are in append mode and there are no urls to append ## we discard the whole FFT del docs[docname] if mode == 'replace': # First we erase previous bibdocs if not pretend: for bibdoc in bibrecdocs.list_bibdocs(): bibdoc.delete() bibrecdocs.dirty = True for docname, (doctype, newname, restriction, version, urls, more_infos, bibdoc_tmpid, bibdoc_tmpver) in docs.iteritems(): write_message("Elaborating olddocname: '%s', newdocname: '%s', doctype: '%s', restriction: '%s', urls: '%s', mode: '%s'" % (docname, newname, doctype, restriction, urls, mode), verbose=9) if mode in ('insert', 'replace'): # new bibdocs, new docnames, new marc if newname in bibrecdocs.get_bibdoc_names(): write_message("('%s', '%s') not inserted because docname already exists." % (newname, urls), stream=sys.stderr) raise StandardError("('%s', '%s') not inserted because docname already exists." % (newname, urls), stream=sys.stderr) try: if not pretend: bibdoc = bibrecdocs.add_bibdoc(doctype, newname) bibdoc.set_status(restriction) else: bibdoc = None except Exception, e: write_message("('%s', '%s', '%s') not inserted because: '%s'." 
% (doctype, newname, urls, e), stream=sys.stderr) raise e for (url, docformat, description, comment, flags, timestamp) in urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp, pretend=pretend)) elif mode == 'replace_or_insert': # to be thought as correct_or_insert try: bibdoc = bibrecdocs.get_bibdoc(docname) found_bibdoc = True except InvenioBibDocFileError: found_bibdoc = False else: if doctype not in ('PURGE', 'DELETE', 'EXPUNGE', 'REVERT', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE'): if newname != docname: try: if not pretend: bibrecdocs.change_name(newname=newname, docid=bibdoc.id) write_message(lambda: "After renaming: %s" % bibrecdocs, verbose=9) except StandardError, e: write_message('ERROR: in renaming %s to %s: %s' % (docname, newname, e), stream=sys.stderr) raise try: bibdoc = bibrecdocs.get_bibdoc(newname) found_bibdoc = True except InvenioBibDocFileError: found_bibdoc = False else: if doctype == 'PURGE': if not pretend: bibdoc.purge() bibrecdocs.dirty = True elif doctype == 'DELETE': if not pretend: bibdoc.delete() bibrecdocs.dirty = True elif doctype == 'EXPUNGE': if not pretend: bibdoc.expunge() bibrecdocs.dirty = True elif doctype == 'FIX-ALL': if not pretend: bibrecdocs.fix(docname) elif doctype == 'FIX-MARC': pass elif doctype == 'DELETE-FILE': if urls: for (url, docformat, description, comment, flags, timestamp) in urls: if not pretend: bibdoc.delete_file(docformat, version) elif doctype == 'REVERT': try: if not pretend: bibdoc.revert(version) except Exception, e: write_message('(%s, %s) not correctly reverted: %s' % (newname, version, e), stream=sys.stderr) raise else: if restriction != KEEP_OLD_VALUE: if not pretend: bibdoc.set_status(restriction) # Since the docname already existed we have to first # bump the version by pushing the first new file # then pushing the other files. if urls: (first_url, first_format, first_description, first_comment, first_flags, first_timestamp) = urls[0] other_urls = urls[1:] assert(_add_new_version(bibdoc, first_url, first_format, docname, doctype, newname, first_description, first_comment, first_flags, first_timestamp, pretend=pretend)) for (url, docformat, description, comment, flags, timestamp) in other_urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp, pretend=pretend)) ## Let's refresh the list of bibdocs. 
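# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): a reading aid for the
# special FFT $t values dispatched in the if/elif chain above when the bibdoc
# already exists.  The one-line summaries are approximate; the authoritative
# behaviour is the code itself.
_SKETCH_SPECIAL_FFT_DOCTYPES = {
    'PURGE':       'bibdoc.purge()        - drop older versions of the bibdoc',
    'DELETE':      'bibdoc.delete()       - soft-delete the whole bibdoc',
    'EXPUNGE':     'bibdoc.expunge()      - remove the bibdoc irreversibly',
    'REVERT':      'bibdoc.revert(v)      - go back to the version in FFT $v',
    'FIX-ALL':     'bibrecdocs.fix(name)  - repair the document structure',
    'FIX-MARC':    'no file action        - only re-synchronize the 856 tags',
    'DELETE-FILE': 'bibdoc.delete_file()  - drop the single format in FFT $f',
}
assert set(_SKETCH_SPECIAL_FFT_DOCTYPES) == set(
    ['PURGE', 'DELETE', 'EXPUNGE', 'REVERT', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE'])
# ---------------------------------------------------------------------------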
if not found_bibdoc: if not pretend: bibdoc = bibrecdocs.add_bibdoc(doctype, newname) bibdoc.set_status(restriction) for (url, docformat, description, comment, flags, timestamp) in urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp)) elif mode == 'correct': try: bibdoc = bibrecdocs.get_bibdoc(docname) found_bibdoc = True except InvenioBibDocFileError: found_bibdoc = False else: if doctype not in ('PURGE', 'DELETE', 'EXPUNGE', 'REVERT', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE'): if newname != docname: try: if not pretend: bibrecdocs.change_name(newname=newname, docid=bibdoc.id) write_message(lambda: "After renaming: %s" % bibrecdocs, verbose=9) except StandardError, e: write_message('ERROR: in renaming %s to %s: %s' % (docname, newname, e), stream=sys.stderr) raise try: bibdoc = bibrecdocs.get_bibdoc(newname) found_bibdoc = True except InvenioBibDocFileError: found_bibdoc = False else: if doctype == 'PURGE': if not pretend: bibdoc.purge() bibrecdocs.dirty = True elif doctype == 'DELETE': if not pretend: bibdoc.delete() bibrecdocs.dirty = True elif doctype == 'EXPUNGE': if not pretend: bibdoc.expunge() bibrecdocs.dirty = True elif doctype == 'FIX-ALL': if not pretend: bibrecdocs.fix(newname) elif doctype == 'FIX-MARC': pass elif doctype == 'DELETE-FILE': if urls: for (url, docformat, description, comment, flags, timestamp) in urls: if not pretend: bibdoc.delete_file(docformat, version) elif doctype == 'REVERT': try: if not pretend: bibdoc.revert(version) except Exception, e: write_message('(%s, %s) not correctly reverted: %s' % (newname, version, e), stream=sys.stderr) raise else: if restriction != KEEP_OLD_VALUE: if not pretend: bibdoc.set_status(restriction) if doctype and doctype != KEEP_OLD_VALUE: if not pretend: bibdoc.change_doctype(doctype) if urls: (first_url, first_format, first_description, first_comment, first_flags, first_timestamp) = urls[0] other_urls = urls[1:] assert(_add_new_version(bibdoc, first_url, first_format, docname, doctype, newname, first_description, first_comment, first_flags, first_timestamp, pretend=pretend)) for (url, docformat, description, comment, flags, timestamp) in other_urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp, pretend=pretend)) if not found_bibdoc: if doctype in ('PURGE', 'DELETE', 'EXPUNGE', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE', 'REVERT'): write_message("('%s', '%s', '%s') not performed because '%s' docname didn't existed." 
% (doctype, newname, urls, docname), stream=sys.stderr) raise StandardError else: if not pretend: bibdoc = bibrecdocs.add_bibdoc(doctype, newname) bibdoc.set_status(restriction) for (url, docformat, description, comment, flags, timestamp) in urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp)) elif mode == 'append': found_bibdoc = False try: bibdoc = bibrecdocs.get_bibdoc(docname) found_bibdoc = True except InvenioBibDocFileError: found_bibdoc = False else: for (url, docformat, description, comment, flags, timestamp) in urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp, pretend=pretend)) if not found_bibdoc: try: if not pretend: bibdoc = bibrecdocs.add_bibdoc(doctype, docname) bibdoc.set_status(restriction) for (url, docformat, description, comment, flags, timestamp) in urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp)) except Exception, e: register_exception() write_message("('%s', '%s', '%s') not appended because: '%s'." % (doctype, newname, urls, e), stream=sys.stderr) raise if not pretend and doctype not in ('PURGE', 'DELETE', 'EXPUNGE'): _process_document_moreinfos(more_infos, newname, version, urls and urls[0][1], mode) # resolving temporary version and identifier if bibdoc_tmpid: if bibdoc_tmpid in tmp_ids and tmp_ids[bibdoc_tmpid] != -1: write_message("WARNING: the temporary identifier %s has been declared more than once. Ignoring the second occurance" % (bibdoc_tmpid, ), stream=sys.stderr) else: tmp_ids[bibdoc_tmpid] = bibrecdocs.get_docid(docname) if bibdoc_tmpver: if bibdoc_tmpver in tmp_vers and tmp_vers[bibdoc_tmpver] != -1: write_message("WARNING: the temporary version identifier %s has been declared more than once. Ignoring the second occurance" % (bibdoc_tmpver, ), stream=sys.stderr) else: if version == None: if version: tmp_vers[bibdoc_tmpver] = version else: tmp_vers[bibdoc_tmpver] = bibrecdocs.get_bibdoc(docname).get_latest_version() else: tmp_vers[bibdoc_tmpver] = version return record ### Update functions def update_bibrec_date(record_modification_date, bibrec_id, insert_mode_p, record_creation_date=None, pretend=False): """ Update the date of the record in bibrec table. Note: record_creation_date is mandatory if insert_mode_p=True. 
""" if insert_mode_p: query = """UPDATE bibrec SET creation_date=%s, modification_date=%s WHERE id=%s""" params = (record_creation_date, record_modification_date, bibrec_id) else: query = """UPDATE bibrec SET modification_date=%s WHERE id=%s""" params = (record_modification_date, bibrec_id) if not pretend: run_sql(query, params) write_message(" -Update record creation/modification date: DONE" , verbose=2) def update_bibfmt_format(id_bibrec, format_value, format_name, modification_date=None, pretend=False): """Update the format in the table bibfmt""" if modification_date is None: modification_date = time.strftime('%Y-%m-%d %H:%M:%S') else: try: time.strptime(modification_date, "%Y-%m-%d %H:%M:%S") except ValueError: modification_date = '1970-01-01 00:00:00' # We check if the format is already in bibFmt nb_found = find_record_format(id_bibrec, format_name) if nb_found == 1: # we are going to update the format # compress the format_value value pickled_format_value = compress(format_value) # update the format: - query = """UPDATE LOW_PRIORITY bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s""" + query = """UPDATE LOW_PRIORITY bibfmt SET last_updated=%s, value=_binary %s WHERE id_bibrec=%s AND format=%s""" params = (modification_date, pickled_format_value, id_bibrec, format_name) if not pretend: row_id = run_sql(query, params) if not pretend and row_id is None: write_message(" ERROR: during update_bibfmt_format function", verbose=1, stream=sys.stderr) return 1 else: write_message(" -Update the format %s in bibfmt: DONE" % format_name , verbose=2) return 0 elif nb_found > 1: write_message(" Failed: Same format %s found several time in bibfmt for the same record." % format_name, verbose=1, stream=sys.stderr) return 1 else: # Insert the format information in BibFMT res = insert_bibfmt(id_bibrec, format_value, format_name, modification_date, pretend=pretend) if res is None: write_message(" ERROR: during insert_bibfmt", verbose=1, stream=sys.stderr) return 1 else: write_message(" -Insert the format %s in bibfmt: DONE" % format_name , verbose=2) return 0 def delete_bibfmt_format(id_bibrec, format_name, pretend=False): """ Delete format FORMAT_NAME from bibfmt table fo record ID_BIBREC. """ if not pretend: run_sql("DELETE LOW_PRIORITY FROM bibfmt WHERE id_bibrec=%s and format=%s", (id_bibrec, format_name)) return 0 def archive_marcxml_for_history(recID, affected_fields, pretend=False): """ Archive current MARCXML format of record RECID from BIBFMT table into hstRECORD table. Useful to keep MARCXML history of records. Return 0 if everything went fine. Return 1 otherwise. 
""" res = run_sql("SELECT id_bibrec, value, last_updated FROM bibfmt WHERE format='xm' AND id_bibrec=%s", (recID,)) db_affected_fields = "" if affected_fields: tmp_affected_fields = {} for field in affected_fields: if field.isdigit(): #hack for tags from RevisionVerifier for ind in affected_fields[field]: tmp_affected_fields[(field + ind[0] + ind[1] + "%").replace(" ", "_")] = 1 else: pass #future implementation for fields tmp_affected_fields = tmp_affected_fields.keys() tmp_affected_fields.sort() db_affected_fields = ",".join(tmp_affected_fields) if res and not pretend: run_sql("""INSERT INTO hstRECORD (id_bibrec, marcxml, job_id, job_name, job_person, job_date, job_details, affected_fields) - VALUES (%s,%s,%s,%s,%s,%s,%s,%s)""", + VALUES (%s,_binary %s,%s,%s,%s,%s,_binary %s,%s)""", (res[0][0], res[0][1], task_get_task_param('task_id', 0), 'bibupload', task_get_task_param('user', 'UNKNOWN'), res[0][2], 'mode: ' + task_get_option('mode', 'UNKNOWN') + '; file: ' + task_get_option('file_path', 'UNKNOWN') + '.', db_affected_fields)) return 0 def update_database_with_metadata(record, rec_id, oai_rec_id="oai", affected_tags=None, pretend=False): """Update the database tables with the record and the record id given in parameter""" # extract only those tags that have been affected. # check happens at subfield level. This is to prevent overhead # associated with inserting already existing field with given ind pair write_message("update_database_with_metadata: record=%s, rec_id=%s, oai_rec_id=%s, affected_tags=%s" % (record, rec_id, oai_rec_id, affected_tags), verbose=9) tmp_record = {} if affected_tags: for tag in record.keys(): if tag in affected_tags.keys(): write_message(" -Tag %s found to be modified.Setting up for update" % tag, verbose=9) # initialize new list to hold affected field new_data_tuple_list = [] for data_tuple in record[tag]: ind1 = data_tuple[1] ind2 = data_tuple[2] if (ind1, ind2) in affected_tags[tag]: write_message(" -Indicator pair (%s, %s) added to update list" % (ind1, ind2), verbose=9) new_data_tuple_list.append(data_tuple) tmp_record[tag] = new_data_tuple_list write_message(lambda: " -Modified fields: \n%s" % record_xml_output(tmp_record), verbose=2) else: tmp_record = record for tag in tmp_record.keys(): # check if tag is not a special one: if tag not in CFG_BIBUPLOAD_SPECIAL_TAGS: # for each tag there is a list of tuples representing datafields tuple_list = tmp_record[tag] # this list should contain the elements of a full tag [tag, ind1, ind2, subfield_code] tag_list = [] tag_list.append(tag) for single_tuple in tuple_list: # these are the contents of a single tuple subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # append the ind's to the full tag if ind1 == '' or ind1 == ' ': tag_list.append('_') else: tag_list.append(ind1) if ind2 == '' or ind2 == ' ': tag_list.append('_') else: tag_list.append(ind2) datafield_number = single_tuple[4] if tag in CFG_BIBUPLOAD_SPECIAL_TAGS: # nothing to do for special tags (FFT, BDR, BDM) pass elif tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS and tag != "001": value = single_tuple[3] # get the full tag full_tag = ''.join(tag_list) # update the tables write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9) # insert the tag and value into into bibxxx (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value, pretend=pretend) #print 'tname, bibrow', table_name, bibxxx_row_id; if table_name is None or bibxxx_row_id is None: write_message(" Failed: during insert_record_bibxxx", 
verbose=1, stream=sys.stderr) # connect bibxxx and bibrec with the table bibrec_bibxxx res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id, pretend=pretend) if res is None: write_message(" Failed: during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr) else: # get the tag and value from the content of each subfield for subfield in subfield_list: subtag = subfield[0] value = subfield[1] tag_list.append(subtag) # get the full tag full_tag = ''.join(tag_list) # update the tables write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9) # insert the tag and value into into bibxxx (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value, pretend=pretend) if table_name is None or bibxxx_row_id is None: write_message(" Failed: during insert_record_bibxxx", verbose=1, stream=sys.stderr) # connect bibxxx and bibrec with the table bibrec_bibxxx res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id, pretend=pretend) if res is None: write_message(" Failed: during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr) # remove the subtag from the list tag_list.pop() tag_list.pop() tag_list.pop() tag_list.pop() write_message(" -Update the database with metadata: DONE", verbose=2) log_record_uploading(oai_rec_id, task_get_task_param('task_id', 0), rec_id, 'P', pretend=pretend) def append_new_tag_to_old_record(record, rec_old): """Append new tags to a old record""" def _append_tag(tag): if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS: if tag == '001': pass else: # if it is a controlfield, just access the value for single_tuple in record[tag]: controlfield_value = single_tuple[3] # add the field to the old record newfield_number = record_add_field(rec_old, tag, controlfield_value=controlfield_value) if newfield_number is None: write_message(" ERROR: when adding the field"+tag, verbose=1, stream=sys.stderr) else: # For each tag there is a list of tuples representing datafields for single_tuple in record[tag]: # We retrieve the information of the tag subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] if '%s%s%s' % (tag, ind1 == ' ' and '_' or ind1, ind2 == ' ' and '_' or ind2) in (CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:5], CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[:5]): ## We don't want to append the external identifier ## if it is already existing. if record_find_field(rec_old, tag, single_tuple)[0] is not None: write_message(" Not adding tag: %s ind1=%s ind2=%s subfields=%s: it's already there" % (tag, ind1, ind2, subfield_list), verbose=9) continue # We add the datafield to the old record write_message(" Adding tag: %s ind1=%s ind2=%s subfields=%s" % (tag, ind1, ind2, subfield_list), verbose=9) newfield_number = record_add_field(rec_old, tag, ind1, ind2, subfields=subfield_list) if newfield_number is None: write_message(" ERROR: when adding the field"+tag, verbose=1, stream=sys.stderr) # Go through each tag in the appended record for tag in record: _append_tag(tag) return rec_old def copy_strong_tags_from_old_record(record, rec_old): """ Look for strong tags in RECORD and REC_OLD. If no strong tags are found in RECORD, then copy them over from REC_OLD. This function modifies RECORD structure on the spot. 
""" for strong_tag in CFG_BIBUPLOAD_STRONG_TAGS: if not record_get_field_instances(record, strong_tag, strong_tag[3:4] or '%', strong_tag[4:5] or '%'): strong_tag_old_field_instances = record_get_field_instances(rec_old, strong_tag) if strong_tag_old_field_instances: for strong_tag_old_field_instance in strong_tag_old_field_instances: sf_vals, fi_ind1, fi_ind2, controlfield, dummy = strong_tag_old_field_instance record_add_field(record, strong_tag, fi_ind1, fi_ind2, controlfield, sf_vals) return ### Delete functions def delete_tags(record, rec_old): """ Returns a record structure with all the fields in rec_old minus the fields in record. @param record: The record containing tags to delete. @type record: record structure @param rec_old: The original record. @type rec_old: record structure @return: The modified record. @rtype: record structure """ returned_record = copy.deepcopy(rec_old) for tag, fields in record.iteritems(): if tag in ('001', ): continue for field in fields: local_position = record_find_field(returned_record, tag, field)[1] if local_position is not None: record_delete_field(returned_record, tag, field_position_local=local_position) return returned_record def delete_tags_to_correct(record, rec_old): """ Delete tags from REC_OLD which are also existing in RECORD. When deleting, pay attention not only to tags, but also to indicators, so that fields with the same tags but different indicators are not deleted. """ ## Some fields are controlled via provenance information. ## We should re-add saved fields at the end. fields_to_readd = {} for tag in CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS: if tag[:3] in record: tmp_field_instances = record_get_field_instances(record, tag[:3], tag[3], tag[4]) ## Let's discover the provenance that will be updated provenances_to_update = [] for instance in tmp_field_instances: for code, value in instance[0]: if code == tag[5]: if value not in provenances_to_update: provenances_to_update.append(value) break else: ## The provenance is not specified. ## let's add the special empty provenance. if '' not in provenances_to_update: provenances_to_update.append('') potential_fields_to_readd = record_get_field_instances(rec_old, tag[:3], tag[3], tag[4]) ## Let's take all the field corresponding to tag ## Let's save apart all the fields that should be updated, but ## since they have a different provenance not mentioned in record ## they should be preserved. fields = [] for sf_vals, ind1, ind2, dummy_cf, dummy_line in potential_fields_to_readd: for code, value in sf_vals: if code == tag[5]: if value not in provenances_to_update: fields.append(sf_vals) break else: if '' not in provenances_to_update: ## Empty provenance, let's protect in any case fields.append(sf_vals) fields_to_readd[tag] = fields # browse through all the tags from the MARCXML file: for tag in record: # check if the tag exists in the old record too: if tag in rec_old and tag != '001': # the tag does exist, so delete all record's tag+ind1+ind2 combinations from rec_old for dummy_sf_vals, ind1, ind2, dummy_cf, dummyfield_number in record[tag]: write_message(" Delete tag: " + tag + " ind1=" + ind1 + " ind2=" + ind2, verbose=9) record_delete_field(rec_old, tag, ind1, ind2) ## Ok, we readd necessary fields! 
for tag, fields in fields_to_readd.iteritems(): for sf_vals in fields: write_message(" Adding tag: " + tag[:3] + " ind1=" + tag[3] + " ind2=" + tag[4] + " code=" + str(sf_vals), verbose=9) record_add_field(rec_old, tag[:3], tag[3], tag[4], subfields=sf_vals) def delete_bibrec_bibxxx(record, id_bibrec, affected_tags={}, pretend=False): """Delete the database record from the table bibxxx given in parameters""" # we clear all the rows from bibrec_bibxxx from the old record # clearing only those tags that have been modified. write_message(lambda: "delete_bibrec_bibxxx(record=%s, id_bibrec=%s, affected_tags=%s)" % (record, id_bibrec, affected_tags), verbose=9) for tag in affected_tags: # sanity check with record keys just to make sure its fine. if tag not in CFG_BIBUPLOAD_SPECIAL_TAGS: write_message("%s found in record"%tag, verbose=2) # for each name construct the bibrec_bibxxx table name table_name = 'bib'+tag[0:2]+'x' bibrec_table = 'bibrec_'+table_name # delete all the records with proper id_bibrec. Indicators matter for individual affected tags tmp_ind_1 = '' tmp_ind_2 = '' # construct exact tag value using indicators for ind_pair in affected_tags[tag]: if ind_pair[0] == ' ': tmp_ind_1 = '_' else: tmp_ind_1 = ind_pair[0] if ind_pair[1] == ' ': tmp_ind_2 = '_' else: tmp_ind_2 = ind_pair[1] # need to escape incase of underscore so that mysql treats it as a char tag_val = tag+"\\"+tmp_ind_1+"\\"+tmp_ind_2 + '%' query = """DELETE br.* FROM `%s` br,`%s` b where br.id_bibrec=%%s and br.id_bibxxx=b.id and b.tag like %%s""" % (bibrec_table, table_name) params = (id_bibrec, tag_val) write_message(query % params, verbose=9) if not pretend: run_sql(query, params) else: write_message("%s not found"%tag, verbose=2) def main(): """Main that construct all the bibtask.""" task_init(authorization_action='runbibupload', authorization_msg="BibUpload Task Submission", description="""Receive MARC XML file and update appropriate database tables according to options. Examples: $ bibupload -i input.xml """, help_specific_usage=""" -a, --append\t\tnew fields are appended to the existing record -c, --correct\t\tfields are replaced by the new ones in the existing record, except \t\t\twhen overridden by CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS -i, --insert\t\tinsert the new record in the database -r, --replace\t\tthe existing record is entirely replaced by the new one, \t\t\texcept for fields in CFG_BIBUPLOAD_STRONG_TAGS -d, --delete\t\tspecified fields are deleted in existing record -n, --notimechange\tdo not change record last modification date when updating -o, --holdingpen\tInsert record into holding pen instead of the normal database --pretend\t\tdo not really insert/append/correct/replace the input file --force\t\twhen --replace, use provided 001 tag values, even if the matching \t\t\trecord does not exist (thus allocating it on-the-fly) --callback-url\tSend via a POST request a JSON-serialized answer (see admin guide), in \t\t\torder to provide a feedback to an external service about the outcome of the operation. --nonce\t\twhen used together with --callback add the nonce value in the JSON message. --special-treatment=MODE\tif "oracle" is specified, when used together with --callback_url, \t\t\tPOST an application/x-www-form-urlencoded request where the JSON message is encoded \t\t\tinside a form field called "results". 
""", version=__revision__, specific_params=("ircazdnoS:", [ "insert", "replace", "correct", "append", "reference", "delete", "notimechange", "holdingpen", "pretend", "force", "callback-url=", "nonce=", "special-treatment=", "stage=", ]), task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter, task_run_fnc=task_run_core, task_submit_check_options_fnc=task_submit_check_options) def task_submit_elaborate_specific_parameter(key, value, opts, args): # pylint: disable=W0613 """ Given the string key it checks it's meaning, eventually using the value. Usually it fills some key in the options dict. It must return True if it has elaborated the key, False, if it doesn't know that key. eg: if key in ['-n', '--number']: task_get_option(\1) = value return True return False """ # No time change option if key in ("-n", "--notimechange"): task_set_option('notimechange', 1) # Insert mode option elif key in ("-i", "--insert"): if task_get_option('mode') == 'replace': # if also replace found, then set to replace_or_insert task_set_option('mode', 'replace_or_insert') else: task_set_option('mode', 'insert') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Replace mode option elif key in ("-r", "--replace"): if task_get_option('mode') == 'insert': # if also insert found, then set to replace_or_insert task_set_option('mode', 'replace_or_insert') else: task_set_option('mode', 'replace') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Holding pen mode option elif key in ("-o", "--holdingpen"): write_message("Holding pen mode", verbose=3) task_set_option('mode', 'holdingpen') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Correct mode option elif key in ("-c", "--correct"): task_set_option('mode', 'correct') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Append mode option elif key in ("-a", "--append"): task_set_option('mode', 'append') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Deprecated reference mode option (now correct) elif key in ("-z", "--reference"): task_set_option('mode', 'correct') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("-d", "--delete"): task_set_option('mode', 'delete') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("--pretend",): task_set_option('pretend', True) fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("--force",): task_set_option('force', True) fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("--callback-url", ): task_set_option('callback_url', value) elif key in ("--nonce", ): task_set_option('nonce', value) elif key in ("--special-treatment", ): if value.lower() in CFG_BIBUPLOAD_ALLOWED_SPECIAL_TREATMENTS: if value.lower() == 'oracle': task_set_option('oracle_friendly', True) else: print >> sys.stderr, """The specified value is not in the list of allowed special treatments codes: %s""" % CFG_BIBUPLOAD_ALLOWED_SPECIAL_TREATMENTS return False elif key in ("-S", "--stage"): print >> sys.stderr, """WARNING: the --stage parameter is deprecated and ignored.""" else: return False return True def task_submit_check_options(): """ Reimplement this method for having the possibility to check options before submitting the task, in order for example to provide default values. 
It must return False if there are errors in the options. """ if task_get_option('mode') is None: write_message("Please specify at least one update/insert mode!", stream=sys.stderr) return False file_path = task_get_option('file_path') if file_path is None: write_message("Missing filename! -h for help.", stream=sys.stderr) return False try: open(file_path).read().decode('utf-8') except IOError: write_message("""File is not accessible: %s""" % file_path, stream=sys.stderr) return False except UnicodeDecodeError: write_message("""File encoding is not valid utf-8: %s""" % file_path, stream=sys.stderr) return False return True def writing_rights_p(): """Return True in case bibupload has the proper rights to write in the fulltext file folder.""" if _WRITING_RIGHTS is not None: return _WRITING_RIGHTS try: if not os.path.exists(CFG_BIBDOCFILE_FILEDIR): os.makedirs(CFG_BIBDOCFILE_FILEDIR) fd, filename = tempfile.mkstemp(suffix='.txt', prefix='test', dir=CFG_BIBDOCFILE_FILEDIR) test = os.fdopen(fd, 'w') test.write('TEST') test.close() if open(filename).read() != 'TEST': raise IOError("Can not successfully write and readback %s" % filename) os.remove(filename) except: register_exception(alert_admin=True) return False return True def post_results_to_callback_url(results, callback_url): write_message("Sending feedback to %s" % callback_url) if not CFG_JSON_AVAILABLE: from warnings import warn warn("--callback-url used but simplejson/json not available") return json_results = json.dumps(results) write_message("Message to send: %s" % json_results, verbose=9) ## :///?# scheme, dummynetloc, dummypath, dummyquery, dummyfragment = urlparse.urlsplit(callback_url) ## See: http://stackoverflow.com/questions/111945/is-there-any-way-to-do-http-put-in-python if scheme == 'http': opener = urllib2.build_opener(urllib2.HTTPHandler) elif scheme == 'https': opener = urllib2.build_opener(urllib2.HTTPSHandler) else: raise ValueError("Scheme not handled %s for callback_url %s" % (scheme, callback_url)) if task_get_option('oracle_friendly'): write_message("Oracle friendly mode requested", verbose=9) request = urllib2.Request(callback_url, data=urllib.urlencode({'results': json_results})) request.add_header('Content-Type', 'application/x-www-form-urlencoded') else: request = urllib2.Request(callback_url, data=json_results) request.add_header('Content-Type', 'application/json') request.add_header('User-Agent', make_user_agent_string('BibUpload')) write_message("Headers about to be sent: %s" % request.headers, verbose=9) write_message("Data about to be sent: %s" % request.data, verbose=9) res = opener.open(request) msg = res.read() write_message("Result of posting the feedback: %s %s" % (res.code, res.msg), verbose=9) write_message("Returned message is: %s" % msg, verbose=9) return res def bibupload_records(records, opt_mode=None, opt_notimechange=0, pretend=False, callback_url=None, results_for_callback=None): """perform the task of uploading a set of records returns list of (error_code, recid) tuples for separate records """ #Dictionaries maintaining temporary identifiers # Structure: identifier -> number tmp_ids = {} tmp_vers = {} results = [] # The first phase -> assigning meaning to temporary identifiers if opt_mode == 'reference': ## NOTE: reference mode has been deprecated in favour of 'correct' opt_mode = 'correct' record = None for record in records: record_id = record_extract_oai_id(record) task_sleep_now_if_required(can_stop_too=True) if opt_mode == "holdingpen": #inserting into the holding pen 
write_message("Inserting into holding pen", verbose=3) insert_record_into_holding_pen(record, record_id, pretend=pretend) else: write_message("Inserting into main database", verbose=3) error = bibupload( record, opt_mode = opt_mode, opt_notimechange = opt_notimechange, oai_rec_id = record_id, pretend = pretend, tmp_ids = tmp_ids, tmp_vers = tmp_vers) results.append(error) if error[0] == 1: if record: write_message(lambda: record_xml_output(record), stream=sys.stderr) else: write_message("Record could not have been parsed", stream=sys.stderr) stat['nb_errors'] += 1 if callback_url: results_for_callback['results'].append({'recid': error[1], 'success': False, 'error_message': error[2]}) elif error[0] == 2: if record: write_message(lambda: record_xml_output(record), stream=sys.stderr) else: write_message("Record could not have been parsed", stream=sys.stderr) if callback_url: results_for_callback['results'].append({'recid': error[1], 'success': False, 'error_message': error[2]}) elif error[0] == 0: if callback_url: from invenio.search_engine import print_record results_for_callback['results'].append({'recid': error[1], 'success': True, "marcxml": print_record(error[1], 'xm'), 'url': "%s/%s/%s" % (CFG_SITE_URL, CFG_SITE_RECORD, error[1])}) else: if callback_url: results_for_callback['results'].append({'recid': error[1], 'success': False, 'error_message': error[2]}) # stat us a global variable task_update_progress("Done %d out of %d." % \ (stat['nb_records_inserted'] + \ stat['nb_records_updated'], stat['nb_records_to_upload'])) # Second phase -> Now we can process all entries where temporary identifiers might appear (BDR, BDM) write_message("Identifiers table after processing: %s versions: %s" % (str(tmp_ids), str(tmp_vers)), verbose=2) write_message("Uploading BDR and BDM fields") if opt_mode != "holdingpen": for record in records: record_id = retrieve_rec_id(record, opt_mode, pretend=pretend, post_phase = True) bibupload_post_phase(record, rec_id = record_id, mode = opt_mode, pretend = pretend, tmp_ids = tmp_ids, tmp_vers = tmp_vers) return results def task_run_core(): """ Reimplement to add the body of the task.""" write_message("Input file '%s', input mode '%s'." 
% (task_get_option('file_path'), task_get_option('mode'))) write_message("STAGE 0:", verbose=2) if task_get_option('file_path') is not None: write_message("start preocessing", verbose=3) task_update_progress("Reading XML input") recs = xml_marc_to_records(open_marc_file(task_get_option('file_path'))) stat['nb_records_to_upload'] = len(recs) write_message(" -Open XML marc: DONE", verbose=2) task_sleep_now_if_required(can_stop_too=True) write_message("Entering records loop", verbose=3) callback_url = task_get_option('callback_url') results_for_callback = {'results': []} if recs is not None: # We proceed each record by record bibupload_records(records=recs, opt_mode=task_get_option('mode'), opt_notimechange=task_get_option('notimechange'), pretend=task_get_option('pretend'), callback_url=callback_url, results_for_callback=results_for_callback) else: write_message(" ERROR: bibupload failed: No record found", verbose=1, stream=sys.stderr) callback_url = task_get_option("callback_url") if callback_url: nonce = task_get_option("nonce") if nonce: results_for_callback["nonce"] = nonce post_results_to_callback_url(results_for_callback, callback_url) if task_get_task_param('verbose') >= 1: # Print out the statistics print_out_bibupload_statistics() # Check if they were errors return not stat['nb_errors'] >= 1 def log_record_uploading(oai_rec_id, task_id, bibrec_id, insertion_db, pretend=False): if oai_rec_id != "" and oai_rec_id != None: query = """UPDATE oaiHARVESTLOG SET date_inserted=NOW(), inserted_to_db=%s, id_bibrec=%s WHERE oai_id = %s AND bibupload_task_id = %s ORDER BY date_harvested LIMIT 1""" if not pretend: run_sql(query, (str(insertion_db), str(bibrec_id), str(oai_rec_id), str(task_id), )) if __name__ == "__main__": main() diff --git a/modules/bibupload/lib/bibupload_regression_tests.py b/modules/bibupload/lib/bibupload_regression_tests.py index c8af4285b..fba667b68 100644 --- a/modules/bibupload/lib/bibupload_regression_tests.py +++ b/modules/bibupload/lib/bibupload_regression_tests.py @@ -1,5744 +1,5744 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 CERN. +# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
# pylint: disable=C0301 """Regression tests for the BibUpload.""" __revision__ = "$Id$" import re from invenio.testutils import InvenioTestCase import os import time import sys import zlib from marshal import loads from zlib import decompress from urllib import urlencode from urllib2 import urlopen import pprint if sys.hexversion < 0x2060000: from md5 import md5 else: from hashlib import md5 # pylint: disable=E0611 from invenio.config import CFG_OAI_ID_FIELD, CFG_PREFIX, CFG_SITE_URL, CFG_TMPDIR, \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, \ CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG, \ CFG_BINDIR, \ CFG_SITE_RECORD, \ CFG_DEVEL_SITE, \ CFG_BIBUPLOAD_REFERENCE_TAG, \ CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE from invenio.access_control_config import CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS from invenio import bibupload from invenio.search_engine import print_record, get_record from invenio.jsonutils import json from invenio.dbquery import run_sql, get_table_status_info from invenio.dateutils import convert_datestruct_to_datetext from invenio.testutils import make_test_suite, run_test_suite, test_web_page_content from invenio.textutils import encode_for_xml from invenio.bibtask import task_set_task_param, setup_loggers, task_set_option, task_low_level_submission from invenio.bibrecord import record_has_field,record_get_field_value, records_identical, create_record from invenio.shellutils import run_shell_command from invenio.bibdocfile import BibRecDocs, BibRelation, MoreInfo import base64 import cPickle # helper functions: RE_005 = re.compile(re.escape('tag="005"')) def get_record_from_bibxxx(recid): """Return a recstruct built from bibxxx tables""" record = "" record += """ %s\n""" % recid # controlfields query = "SELECT b.tag,b.value,bb.field_number FROM bib00x AS b, bibrec_bib00x AS bb "\ "WHERE bb.id_bibrec=%s AND b.id=bb.id_bibxxx AND b.tag LIKE '00%%' "\ "ORDER BY bb.field_number, b.tag ASC" res = run_sql(query, (recid, )) for row in res: field, value = row[0], row[1] value = encode_for_xml(value) record += """ %s\n""" % \ (encode_for_xml(field[0:3]), value) # datafields i = 1 # Do not process bib00x and bibrec_bib00x, as # they are controlfields. 
So start at bib01x and # bibrec_bib00x (and set i = 0 at the end of # first loop) for digit1 in range(0, 10): for digit2 in range(i, 10): bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\ "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s"\ "ORDER BY bb.field_number, b.tag ASC" % (bx, bibx) res = run_sql(query, (recid, str(digit1)+str(digit2)+'%')) field_number_old = -999 field_old = "" for row in res: field, value, field_number = row[0], row[1], row[2] ind1, ind2 = field[3], field[4] if ind1 == "_" or ind1 == "": ind1 = " " if ind2 == "_" or ind2 == "": ind2 = " " if field_number != field_number_old or field[:-1] != field_old[:-1]: if field_number_old != -999: record += """ \n""" record += """ \n""" % \ (encode_for_xml(field[0:3]), encode_for_xml(ind1), encode_for_xml(ind2)) field_number_old = field_number field_old = field # print subfield value value = encode_for_xml(value) record += """ %s\n""" % \ (encode_for_xml(field[-1:]), value) # all fields/subfields printed in this run, so close the tag: if field_number_old != -999: record += """ \n""" i = 0 # Next loop should start looking at bib%0 and bibrec_bib00x # we are at the end of printing the record: record += " \n" return record def remove_tag_001_from_xmbuffer(xmbuffer): """Remove tag 001 from MARCXML buffer. Useful for testing two MARCXML buffers without paying attention to recIDs attributed during the bibupload. """ return re.sub(r'.*', '', xmbuffer) def compare_xmbuffers(xmbuffer1, xmbuffer2): """Compare two XM (XML MARC) buffers by removing whitespaces and version numbers in tags 005 before testing. """ def remove_blanks_from_xmbuffer(xmbuffer): """Remove \n and blanks from XMBUFFER.""" out = xmbuffer.replace("\n", "") out = out.replace(" ", "") return out # remove 005 revision numbers: xmbuffer1 = re.sub(r'.*?', '', xmbuffer1) xmbuffer2 = re.sub(r'.*?', '', xmbuffer2) # remove whitespace: xmbuffer1 = remove_blanks_from_xmbuffer(xmbuffer1) xmbuffer2 = remove_blanks_from_xmbuffer(xmbuffer2) if len(RE_005.findall(xmbuffer1)) > 1: return "More than 1 005 tag has been found in the first XM: %s" % xmbuffer1 if len(RE_005.findall(xmbuffer2)) > 1: return "More than 1 005 tag has been found in the second XM: %s" % xmbuffer2 if xmbuffer1 != xmbuffer2: return "\n=" + xmbuffer1 + "=\n" + '!=' + "\n=" + xmbuffer2 + "=\n" return '' def remove_tag_001_from_hmbuffer(hmbuffer): """Remove tag 001 from HTML MARC buffer. Useful for testing two HTML MARC buffers without paying attention to recIDs attributed during the bibupload. """ return re.sub(r'(^|\n)(
<pre>)?[0-9]{9}\s001__\s\d+($|\n)', '', hmbuffer)
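# The two helpers above (together with remove_tag_001_from_xmbuffer further
# up) let the tests compare an uploaded record against a fixture that was
# written without knowing which recid bibupload would assign.  A minimal
# self-contained sketch of the same idea (illustrative only; the name
# _example_strip_controlfield is not part of the test suite):
def _example_strip_controlfield(xmbuffer, tag='001'):
    """Drop one MARCXML controlfield (e.g. 001 or 005) before comparing."""
    return re.sub(r'<controlfield tag="%s">.*?</controlfield>' % tag, '', xmbuffer)
#
# >>> _example_strip_controlfield('<record><controlfield tag="001">42</controlfield></record>')
# '<record></record>'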
 
 def compare_hmbuffers(hmbuffer1, hmbuffer2):
     """Compare two HM (HTML MARC) buffers by removing whitespaces
        before testing.
     """
 
     hmbuffer1 = hmbuffer1.strip()
     hmbuffer2 = hmbuffer2.strip()
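    # What follows normalises both buffers before comparison: strip any
    # leading/trailing pre-formatting wrapper, drop the 005 (modification
    # timestamp) lines, drop the leading nine-digit record id so that only
    # field values remain, and trim leading whitespace.  The function returns
    # '' when the normalised buffers match, and a "=<buffer1>= != =<buffer2>="
    # style string otherwise.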
 
    # remove eventual <pre>...</pre> formatting:
    hmbuffer1 = re.sub(r'^<pre>', '', hmbuffer1)
    hmbuffer2 = re.sub(r'^<pre>', '', hmbuffer2)
    hmbuffer1 = re.sub(r'</pre>$', '', hmbuffer1)
    hmbuffer2 = re.sub(r'</pre>
$', '', hmbuffer2) # remove 005 revision numbers: hmbuffer1 = re.sub(r'(^|\n)[0-9]{9}\s005.*($|\n)', '\n', hmbuffer1) hmbuffer2 = re.sub(r'(^|\n)[0-9]{9}\s005.*($|\n)', '\n', hmbuffer2) hmbuffer1 = hmbuffer1.strip() hmbuffer2 = hmbuffer2.strip() # remove leading recid, leaving only field values: hmbuffer1 = re.sub(r'(^|\n)[0-9]{9}\s', '', hmbuffer1) hmbuffer2 = re.sub(r'(^|\n)[0-9]{9}\s', '', hmbuffer2) # remove leading whitespace: hmbuffer1 = re.sub(r'(^|\n)\s+', '', hmbuffer1) hmbuffer2 = re.sub(r'(^|\n)\s+', '', hmbuffer2) compared_hmbuffers = hmbuffer1 == hmbuffer2 if not compared_hmbuffers: return "\n=" + hmbuffer1 + "=\n" + '!=' + "\n=" + hmbuffer2 + "=\n" return '' def wipe_out_record_from_all_tables(recid): """ Wipe out completely the record and all its traces of RECID from the database (bibrec, bibrec_bibxxx, bibxxx, bibfmt). Useful for the time being for test cases. """ # delete all the linked bibdocs try: for bibdoc in BibRecDocs(recid).list_bibdocs(): bibdoc.expunge() # delete from bibrec: run_sql("DELETE FROM bibrec WHERE id=%s", (recid,)) # delete from bibrec_bibxxx: for i in range(0, 10): for j in range(0, 10): run_sql("DELETE FROM %(bibrec_bibxxx)s WHERE id_bibrec=%%s" % # kwalitee: disable=sql {'bibrec_bibxxx': "bibrec_bib%i%ix" % (i, j)}, (recid,)) # delete all unused bibxxx values: for i in range(0, 10): for j in range(0, 10): run_sql("DELETE %(bibxxx)s FROM %(bibxxx)s " \ " LEFT JOIN %(bibrec_bibxxx)s " \ " ON %(bibxxx)s.id=%(bibrec_bibxxx)s.id_bibxxx " \ " WHERE %(bibrec_bibxxx)s.id_bibrec IS NULL" % \ {'bibxxx': "bib%i%ix" % (i, j), 'bibrec_bibxxx': "bibrec_bib%i%ix" % (i, j)}) # delete from bibfmt: run_sql("DELETE FROM bibfmt WHERE id_bibrec=%s", (recid,)) # delete from bibrec_bibdoc: run_sql("DELETE FROM bibrec_bibdoc WHERE id_bibrec=%s", (recid,)) # delete from holdingpen run_sql("DELETE FROM bibHOLDINGPEN WHERE id_bibrec=%s", (recid,)) # delete from hstRECORD run_sql("DELETE FROM hstRECORD WHERE id_bibrec=%s", (recid,)) except Exception, err: print >> sys.stderr, "Exception captured while wiping records: %s" % err def try_url_download(url): """Try to download a given URL""" try: open_url = urlopen(url) open_url.read() except Exception, e: raise StandardError("Downloading %s is impossible because of %s" % (url, str(e))) return True class GenericBibUploadTest(InvenioTestCase): """Generic BibUpload testing class with predefined setUp and tearDown methods. 
""" def setUp(self): self.verbose = 0 setup_loggers() task_set_task_param('verbose', self.verbose) self.last_recid = run_sql("SELECT MAX(id) FROM bibrec")[0][0] self.tear_down = True ## For debugging, whether to call tearDown self.webcolled_recids = [] ## List of record webcolled to be re-webcolled upon tearDown def tearDown(self): if self.tear_down: for recid in run_sql("SELECT id FROM bibrec WHERE id>%s", (self.last_recid,)): wipe_out_record_from_all_tables(recid[0]) for recid in list(self.webcolled_recids): self.force_webcoll(recid) def force_webcoll(self, recid): self.webcolled_recids.append(recid) from invenio.bibindex_engine_config import CFG_BIBINDEX_INDEX_TABLE_TYPE from invenio import bibindex_engine from invenio import websearch_webcoll ## Reset the collection global cache websearch_webcoll.COLLECTION_HOUSE = {} bibindex_engine.WordTable("collection", table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"] ).add_recIDs([[recid, recid]], 1) #sleep 1s to make sure all tables are ready time.sleep(1) c = websearch_webcoll.Collection() c.calculate_reclist() c.update_reclist() def check_record_consistency(self, recid): rec_in_history = create_record(decompress(run_sql("SELECT marcxml FROM hstRECORD WHERE id_bibrec=%s ORDER BY job_date DESC LIMIT 1", (recid, ))[0][0]))[0] rec_in_xm = create_record(decompress(run_sql("SELECT value FROM bibfmt WHERE id_bibrec=%s AND format='xm'", (recid, ))[0][0]))[0] rec_in_bibxxx = create_record(get_record_from_bibxxx(recid))[0] self.failUnless(records_identical(rec_in_xm, rec_in_history, skip_005=False), "\n%s\n!=\n%s\n" % (rec_in_xm, rec_in_history)) self.failUnless(records_identical(rec_in_xm, rec_in_bibxxx, skip_005=False, ignore_duplicate_subfields=True, ignore_duplicate_controlfields=True), "\n%s\n!=\n%s\n" % (rec_in_xm, rec_in_bibxxx)) if CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE: rec_in_recstruct = loads(decompress(run_sql("SELECT value FROM bibfmt WHERE id_bibrec=%s AND format='recstruct'", (recid, ))[0][0])) self.failUnless(records_identical(rec_in_xm, rec_in_recstruct, skip_005=False, ignore_subfield_order=True), "\n%s\n!=\n%s\n" % (rec_in_xm, rec_in_recstruct)) class BibUploadRealCaseRemovalDOIViaBibEdit(GenericBibUploadTest): def test_removal_of_doi_via_bibedit(self): test = """ HEP Fiore, Gaetano On quantum mechanics with a magnetic field on R**n and on a torus T**n, and their relation Int.J.Theor.Phys. 52 877-896 2013 INSPIRE General Physics Published 20 2013 author Bloch theory with magnetic field author Fiber bundles author Gauge symmetry author Quantization on manifolds Springer We show in elementary terms the equivalence in a general gauge of a U(1)-gauge theory of a scalar charged particle on a torus to the analogous theory on ℝ( )n( ) constrained by quasiperiodicity under translations in the lattice Λ. The latter theory provides a global description of the former: the quasiperiodic wavefunctions ψ defined on ℝ( )n( ) play the role of sections of the associated hermitean line bundle E on , since also E admits a global description as a quotient. The components of the covariant derivatives corresponding to a constant (necessarily integral) magnetic field B=dA generate a Lie algebra g ( )Q( ) and together with the periodic functions the algebra of observables . The non-abelian part of g ( )Q( ) is a Heisenberg Lie algebra with the electric charge operator Q as the central generator, the corresponding Lie group G ( )Q( ) acts on the Hilbert space as the translation group up to phase factors. 
Also the space of sections of E is mapped into itself by g∈G ( )Q( ). We identify the socalled magnetic translation group as a subgroup of the observables’ group Y ( )Q( ). We determine the unitary irreducible representations of corresponding to integer charges and for each of them an associated orthonormal basis explicitly in configuration space. We also clarify how in the n=2m case a holomorphic structure and Theta functions arise on the associated complex torus. DOI 10.1007/s10773-012-1396-z Fiore:2013nua INSPIRETeX Published Citeable """ recs = create_record(test) _, recid, _ = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid) new_rec = get_record(recid) del new_rec['024'] ## let's delete DOI _, recid2, _ = bibupload.bibupload(new_rec, opt_mode='replace') self.assertEqual(recid, recid2) self.check_record_consistency(recid2) class BibUploadTypicalBibEditSessionTest(GenericBibUploadTest): """Testing a typical BibEdit session""" def setUp(self): GenericBibUploadTest.setUp(self) self.test = """ SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ recs = bibupload.xml_marc_to_records(self.test) # We call the main function with the record as a parameter _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(self.recid) # We retrieve the inserted xml inserted_xm = print_record(self.recid, 'xm') # Compare if the two MARCXML are the same self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(inserted_xm), self.test), '') self.history = run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, )) # kwalitee: disable=sql self.timestamp = run_sql("SELECT modification_date FROM bibrec WHERE id=%s", (self.recid,)) self.tag005 = get_record(self.recid)['005'][0][3] def test_simple_replace(self): """BibUpload - test a simple replace as in BibEdit""" marc_to_replace1 = """ %(recid)s %(tag005)s SzGeCERN Test, Foo Test Institute Test, John Test University Cool Test, Jim Test Laboratory bla bla bla """ % {'recid': self.recid, 'tag005': self.tag005} recs = bibupload.xml_marc_to_records(marc_to_replace1) # We call the main function with the record as a parameter _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(self.recid) ## The change should have been applied! 
self.failUnless(records_identical(recs[0], get_record(self.recid)), "\n%s\n!=\n%s\n" % (recs[0], get_record(self.recid))) marc_to_replace2 = """ %(recid)s %(tag005)s SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory Queen Elisabeth Great Britain """ % {'recid': self.recid, 'tag005': self.tag005} expected_marc = """ %(recid)s %(tag005)s SzGeCERN Test, Foo Test Institute Test, John Test University Cool Test, Jim Test Laboratory bla bla bla Queen Elisabeth Great Britain """ % {'recid': self.recid, 'tag005': self.tag005} recs = bibupload.xml_marc_to_records(marc_to_replace2) # We call the main function with the record as a parameter _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(self.recid) ## The change should have been merged with the previous without conflict self.failUnless(records_identical(bibupload.xml_marc_to_records(expected_marc)[0], get_record(self.recid))) def test_replace_with_conflict(self): """BibUpload - test a replace as in BibEdit that leads to conflicts""" marc_to_replace1 = """ %(recid)s %(tag005)s SzGeCERN Test, Foo Test Institute2 Test, John Test University Cool Test, Jim Test Laboratory bla bla bla """ % {'recid': self.recid, 'tag005': self.tag005} recs = bibupload.xml_marc_to_records(marc_to_replace1) # We call the main function with the record as a parameter _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(self.recid) ## The change should have been applied! self.failUnless(records_identical(recs[0], get_record(self.recid)), "\n%s\n!=\n%s" % (recs[0], get_record(self.recid))) marc_to_replace2 = """ %(recid)s %(tag005)s SzGeCERN Queen Elisabeth Great Britain Test, John Test University No more Cool Test, Jim Test Laboratory bla bla bla """ % {'recid': self.recid, 'tag005': self.tag005} recs = bibupload.xml_marc_to_records(marc_to_replace2) # We call the main function with the record as a parameter _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(self.recid) ## The change should have been merged with the previous without conflict self.failUnless(records_identical(bibupload.xml_marc_to_records(marc_to_replace1)[0], get_record(self.recid)), "%s != %s" % (bibupload.xml_marc_to_records(marc_to_replace1)[0], get_record(self.recid))) self.failUnless(records_identical(bibupload.xml_marc_to_records(marc_to_replace2)[0], bibupload.xml_marc_to_records(zlib.decompress(run_sql("SELECT changeset_xml FROM bibHOLDINGPEN WHERE id_bibrec=%s", (self.recid,))[0][0]))[0])) class BibUploadNoUselessHistoryTest(GenericBibUploadTest): """Testing generation of history only when necessary""" def setUp(self): GenericBibUploadTest.setUp(self) self.test = """ SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ recs = bibupload.xml_marc_to_records(self.test) # We call the main function with the record as a parameter _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(self.recid) # We retrieve the inserted xml inserted_xm = print_record(self.recid, 'xm') # Compare if the two MARCXML are the same self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(inserted_xm), self.test), '') self.history = run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, )) # kwalitee: disable=sql self.timestamp = run_sql("SELECT modification_date FROM bibrec WHERE id=%s", (self.recid,)) def 
test_replace_identical_record(self): """bibupload - replace with identical record does not touch history""" xml_to_upload = """ %s SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ % self.recid recs = bibupload.xml_marc_to_records(xml_to_upload) # We call the main function with the record as a parameter _, recid, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(recid) self.assertEqual(self.recid, recid) self.assertEqual(self.history, run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, ))) # kwalitee: disable=sql self.assertEqual(self.timestamp, run_sql("SELECT modification_date FROM bibrec WHERE id=%s", (self.recid,))) def test_correct_identical_correction(self): """bibupload - correct with identical correction does not touch history""" xml_to_upload = """ %s SzGeCERN """ % self.recid recs = bibupload.xml_marc_to_records(xml_to_upload) # We call the main function with the record as a parameter _, recid, _ = bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) self.assertEqual(self.recid, recid) self.maxDiff = None self.assertEqual(self.history, run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, ))) # kwalitee: disable=sql self.assertEqual(self.timestamp, run_sql("SELECT modification_date FROM bibrec WHERE id=%s", (self.recid,))) def test_replace_different_record(self): """bibupload - replace with different records does indeed touch history""" xml_to_upload = """ %s SzGeCERN Test, Jane Test Institute Test, John Test University Test, Jim Test Laboratory """ % self.recid recs = bibupload.xml_marc_to_records(xml_to_upload) # We call the main function with the record as a parameter _, recid, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(recid) self.assertEqual(self.recid, recid) self.assertNotEqual(self.history, run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, ))) # kwalitee: disable=sql self.failUnless(len(self.history) == 1 and len(run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, ))) == 2) # kwalitee: disable=sql self.assertNotEqual(self.timestamp, run_sql("SELECT modification_date FROM bibrec WHERE id=%s", (self.recid,))) def test_correct_different_correction(self): """bibupload - correct with different correction does indeed touch history""" xml_to_upload = """ %s FooBar """ % self.recid recs = bibupload.xml_marc_to_records(xml_to_upload) # We call the main function with the record as a parameter _, recid, _ = bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) self.assertEqual(self.recid, recid) self.assertNotEqual(self.history, run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, ))) # kwalitee: disable=sql self.failUnless(len(self.history) == 1 and len(run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, ))) == 2) # kwalitee: disable=sql self.assertNotEqual(self.timestamp, run_sql("SELECT modification_date FROM bibrec WHERE id=%s", (self.recid,))) class BibUploadCallbackURLTest(GenericBibUploadTest): """Testing usage of CLI callback_url""" def setUp(self): GenericBibUploadTest.setUp(self) self.test = """ something Tester, J Y MIT Tester, K J CERN2 Tester, G CERN3 test11 test31 test12 test32 test13 test33 test21 test41 test22 test42 test14 test51 test52 Tester, T CERN """ self.testfile_path = os.path.join(CFG_TMPDIR, 'bibupload_regression_test_input.xml') open(self.testfile_path, 
"w").write(self.test) self.resultfile_path = os.path.join(CFG_TMPDIR, 'bibupload_regression_test_result.json') if CFG_DEVEL_SITE: def test_simple_insert_callback_url(self): """bibupload - --callback-url with simple insert""" # taskid = task_low_level_submission('bibupload', 'test', '-i', self.testfile_path, '--callback-url', CFG_SITE_URL + '/httptest/post2?%s' % urlencode({"save": self.resultfile_path}), '-v0') run_shell_command(CFG_BINDIR + '/bibupload %s', [str(taskid)]) results = json.loads(open(self.resultfile_path).read()) self.failUnless('results' in results) self.assertEqual(len(results['results']), 1) self.failUnless(results['results'][0]['success']) self.failUnless(results['results'][0]['recid'] > 0) self.failUnless("""Tester, J Y""" in results['results'][0]['marcxml'], results['results'][0]['marcxml']) class BibUploadBibRelationsTest(GenericBibUploadTest): def setUp(self): GenericBibUploadTest.setUp(self) self.upload_xml = """ A very wise author %(url_site)s/img/user-icon-1-20x20.gif Main docname TMP:id_identifier1 TMP:ver_identifier1 %(url_site)s/record/8/files/9812226.pdf?version=1 Main docname2 TMP:id_identifier2 TMP:ver_identifier2 TMP:id_identifier1 TMP:ver_identifier1 TMP:id_identifier2 TMP:ver_identifier2 is_extracted_from """ % {'url_site' : CFG_SITE_URL} def test_upload_with_tmpids(self): """bibupload - Trying to upload a relation between two new documents ... and then to delete""" recs = bibupload.xml_marc_to_records(self.upload_xml) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] # ertrive document numbers and check if there exists a relation between them brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of documents attached to a record") rels = docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from") self.assertEqual(1, len(rels), "Incorrect number of relations retrieved from the first document") rels = docs[1].get_incoming_relations("is_extracted_from") + docs[1].get_outgoing_relations("is_extracted_from") self.assertEqual(1, len(rels), "Incorrect number of relations retrieved from the second document") created_relation_id = rels[0].id rels = docs[0].get_incoming_relations("different_type_of_relation") + docs[0].get_outgoing_relations("different_type_of_relation") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") upload_xml_2 = """ %(rec_id)s %(rel_id)s DELETE """ % {'rel_id' : created_relation_id, 'rec_id' : recid} recs = bibupload.xml_marc_to_records(upload_xml_2) bibupload.bibupload_records(recs, opt_mode='correct')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of documents attached to a record") rels = docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") rels = docs[1].get_incoming_relations("is_extracted_from") + docs[1].get_outgoing_relations("is_extracted_from") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the second document") rels = docs[0].get_incoming_relations("different_type_of_relation") + docs[0].get_outgoing_relations("different_type_of_relation") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") def test_delete_by_docids(self): """bibupload - delete relation entry by the docid inside the currently 
modified record Uploading a sample relation and trying to modify it by refering to other parameters than the relation number""" recs = bibupload.xml_marc_to_records(self.upload_xml) dummyerr, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of attached documents") rel = (docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from"))[0] upload_xml_2 = """ %(rec_id)s %(first_docid)s %(first_docver)s %(second_docid)s %(second_docver)s is_extracted_from DELETE """ % { 'rec_id' : recid, 'first_docid': rel.bibdoc1_id, 'first_docver' : rel.bibdoc1_ver, 'second_docid': rel.bibdoc2_id, 'second_docver' : rel.bibdoc2_ver} recs = bibupload.xml_marc_to_records(upload_xml_2) bibupload.bibupload_records(recs, opt_mode='correct')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of documents attached to a record") rels = docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") rels = docs[1].get_incoming_relations("is_extracted_from") + docs[1].get_outgoing_relations("is_extracted_from") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the second document") rels = docs[0].get_incoming_relations("different_type_of_relation") + docs[0].get_outgoing_relations("different_type_of_relation") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") def test_remove_by_name(self): """bibupload - trying removing relation by providing bibdoc names rather than relation numbers""" recs = bibupload.xml_marc_to_records(self.upload_xml) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of attached documents") rel = (docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from"))[0] upload_xml_2 = """ %(rec_id)s docname %(first_docver)s docname2 %(second_docver)s is_extracted_from DELETE """ % {'rec_id' : recid, 'first_docver' : rel.bibdoc1_ver, 'second_docver' : rel.bibdoc2_ver} # the above is incorrect ! 
we assert that nothing has been removed recs = bibupload.xml_marc_to_records(upload_xml_2) _ = bibupload.bibupload_records(recs, opt_mode='correct')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of documents attached to a record") rels = docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") rels = docs[1].get_incoming_relations("is_extracted_from") + docs[1].get_outgoing_relations("is_extracted_from") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the second document") rels = docs[0].get_incoming_relations("different_type_of_relation") + docs[0].get_outgoing_relations("different_type_of_relation") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") def test_remove_by_name_incorrect(self): """bibupload - trying removing relation by providing bibdoc names rather than relation numbers, but providing incorrect name""" recs = bibupload.xml_marc_to_records(self.upload_xml) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of attached documents") rel = (docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from"))[0] upload_xml_2 = """ %(rec_id)s docname1 %(first_docver)s docname2 %(second_docver)s is_extracted_from DELETE """ % { 'rec_id' : recid, 'first_docver' : rel.bibdoc1_ver, 'second_docver' : rel.bibdoc2_ver} # the above is incorrect ! we assert that nothing has been removed recs = bibupload.xml_marc_to_records(upload_xml_2) _ = bibupload.bibupload_records(recs, opt_mode='correct')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of documents attached to a record") rels = docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from") self.assertEqual(1, len(rels), "Incorrect number of relations retrieved from the first document") rels = docs[1].get_incoming_relations("is_extracted_from") + docs[1].get_outgoing_relations("is_extracted_from") self.assertEqual(1, len(rels), "Incorrect number of relations retrieved from the second document") rels = docs[0].get_incoming_relations("different_type_of_relation") + docs[0].get_outgoing_relations("different_type_of_relation") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") def _upload_initial_moreinfo_key(self): """Prepare MoreInfo with sample keys and check it has been correctly uploaded uploaded dic: {"ns1" : {"k1":"val1", "k2":[1,2,3,"something"], "k3" : (1,3,2,"something else"), "k4" : {"a":"b", 1:2}}} ... 
after encoding gives KGRwMQpTJ25zMScKcDIKKGRwMwpTJ2szJwpwNAooSTEKSTMKSTIKUydzb21ldGhpbmcgZWxzZScKdHA1CnNTJ2syJwpwNgoobHA3CkkxCmFJMgphSTMKYVMnc29tZXRoaW5nJwpwOAphc1MnazEnCnA5ClMndmFsMScKcDEwCnNTJ2s0JwpwMTEKKGRwMTIKUydhJwpTJ2InCnNJMQpJMgpzc3Mu """ moreinfo_str = "KGRwMQpTJ25zMScKcDIKKGRwMwpTJ2szJwpwNAooSTEKSTMKSTIKUydzb21ldGhpbmcgZWxzZScKdHA1CnNTJ2syJwpwNgoobHA3CkkxCmFJMgphSTMKYVMnc29tZXRoaW5nJwpwOAphc1MnazEnCnA5ClMndmFsMScKcDEwCnNTJ2s0JwpwMTEKKGRwMTIKUydhJwpTJ2InCnNJMQpJMgpzc3Mu" xml_to_upload = """ A very wise author %(url_site)s/img/user-icon-1-20x20.gif Main docname TMP:id_identifier1 TMP:ver_identifier1 %(url_site)s/record/8/files/9812226.pdf?version=1 Main docname2 TMP:id_identifier2 TMP:ver_identifier2 TMP:id_identifier1 TMP:ver_identifier1 TMP:id_identifier2 TMP:ver_identifier2 is_extracted_from %(moreinfo_str)s """ % {'url_site' : CFG_SITE_URL, 'moreinfo_str' : moreinfo_str} recs = bibupload.xml_marc_to_records(xml_to_upload) dummyerr, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of attached documents") return ((docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from"))[0], recid) def test_add_relation_moreinfo_key(self): """bibupload - upload new MoreInfo key into the dictionary related to a relation""" rel, _ = self._upload_initial_moreinfo_key() # asserting correctness of data self.assertEqual(rel.more_info.get_data("ns1", "k1"), "val1", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k1)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[0], 1, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[1], 2, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[2], 3, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[3], "something", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k3"), (1,3,2,"something else") , "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k3)") self.assertEqual(rel.more_info.get_data("ns1", "k4")[1], 2, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k4)") self.assertEqual(rel.more_info.get_data("ns1", "k4")["a"], "b", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k4)") def test_modify_relation_moreinfo_key(self): """bibupload - modify existing MoreInfo key """ #the update : {"ns1":{"k1": "different value"}} rel, recid = self._upload_initial_moreinfo_key() moreinfo_str = "KGRwMQpTJ25zMScKcDIKKGRwMwpTJ2sxJwpwNApTJ2RpZmZlcmVudCB2YWx1ZScKcDUKc3Mu" upload_xml = """ %(rec_id)s docname docname2 1 1 is_extracted_from %(moreinfo_str)s """ % {"rec_id" : recid, "moreinfo_str": moreinfo_str} recs = bibupload.xml_marc_to_records(upload_xml) bibupload.bibupload_records(recs, opt_mode='correct')[0] rel = BibRelation(rel_id = rel.id) self.assertEqual(rel.more_info.get_data("ns1", "k1"), "different value", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k1)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[0], 1, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") 
self.assertEqual(rel.more_info.get_data("ns1", "k2")[1], 2, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[2], 3, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[3], "something", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k3"), (1,3,2,"something else") , "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k3)") self.assertEqual(rel.more_info.get_data("ns1", "k4")[1], 2, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k4)") self.assertEqual(rel.more_info.get_data("ns1", "k4")["a"], "b", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k4)") self.assertEqual(rel.more_info.get_data("ns2", "k4"), None, "Retrieved not none value for nonexisting namespace !") def test_remove_relation_moreinfo_key(self): """bibupload - remove existing MoreInfo key """ #the update : {"ns1":{"k3": None}} rel, recid = self._upload_initial_moreinfo_key() moreinfo_str = "KGRwMQpTJ25zMScKcDIKKGRwMwpTJ2szJwpwNApOc3Mu" upload_xml = """ %(rec_id)s docname docname2 1 1 is_extracted_from %(moreinfo_str)s """ % {"rec_id" : recid, "moreinfo_str": moreinfo_str} recs = bibupload.xml_marc_to_records(upload_xml) bibupload.bibupload_records(recs, opt_mode='correct') rel = BibRelation(rel_id = rel.id) self.assertEqual(rel.more_info.get_data("ns1", "k1"), "val1", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k1)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[0], 1, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[1], 2, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[2], 3, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[3], "something", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k3"), None , "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k3)") self.assertEqual(rel.more_info.get_data("ns1", "k4")[1], 2, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k4)") self.assertEqual(rel.more_info.get_data("ns1", "k4")["a"], "b", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k4)") class BibUploadMoreInfoTest(GenericBibUploadTest): """bibupload - Testing upload of different types of MoreInfo """ def _dict_checker(self, dic, more_info, equal = True): """ Check the more_info for being conform with the dictionary @param equal - The mode of conformity. True means that the dictionary has to be equal with the MoreInfo. 
False means that dictionary has to be contained in the MoreInfo """ for namespace in dic: for key in dic[namespace]: self.assertEqual(cPickle.dumps(dic[namespace][key]), cPickle.dumps(more_info.get_data(namespace, key)), "Different values for the value of key %s in the namespace %s inside of the MoreInfo object" % \ (namespace, key)) if equal: for namespace in more_info.get_namespaces(): for key in more_info.get_keys(namespace): self.assertTrue(namespace in dic, "namespace %s present in the MoreInfo, but not present in the dictionary" % \ (namespace, )) self.assertTrue(key in dic[namespace], "key %s present in the namespace %s of the MoreInfo but not present in the dictionary" % \ (namespace, key)) self.assertEqual(cPickle.dumps(more_info.get_data(namespace, key)), cPickle.dumps(dic[namespace][key]), "Value for namespace '%s' and key '%s' varies between MoreInfo and the dictionary. moreinfo value: '%s' dictionary value: '%s'" % \ (namespace, key, repr(more_info.get_data(namespace, key)), repr(dic[namespace][key]))) def test_relation_moreinfo_insert(self): """bibupload - Testing the upload of BibRelation and corresponding MoreInfo field""" # Cleaning existing data rels = BibRelation.get_relations(bibdoc1_id = 70, bibdoc2_id = 71, rel_type = "is_extracted_from") for rel in rels: rel.delete() # Uploading relation_upload_template = """ 70 71 is_extracted_from %s Some author """ data_to_insert = {"first namespace": {"k1" : "val1", "k2" : "val2"}, "second" : {"k1" : "#@$#$@###!!!", "k123": {1:2, 9: (6,2,7)}}} serialised = base64.b64encode(cPickle.dumps(data_to_insert)) recs = bibupload.xml_marc_to_records(relation_upload_template % (serialised, )) bibupload.bibupload_records(recs, opt_mode='insert')[0] # Verifying the correctness of the uploaded data rels = BibRelation.get_relations(bibdoc1_id = 70, bibdoc2_id = 71, rel_type = "is_extracted_from") self.assertEqual(len(rels), 1) rel = rels[0] self.assertEqual(rel.bibdoc1_id, 70) self.assertEqual(rel.bibdoc2_id, 71) self.assertEqual(rel.get_data("first namespace", "k1"), "val1") self.assertEqual(rel.get_data("first namespace", "k2"), "val2") self.assertEqual(rel.get_data("second", "k1"), "#@$#$@###!!!") self.assertEqual(rel.get_data("second", "k123")[1], 2) self.assertEqual(rel.get_data("second", "k123")[9], (6,2,7)) self._dict_checker(data_to_insert, rel.more_info) # Cleaning after the upload ... 
just in case we have selected more for rel in rels: rel.delete() def _serialise_data(self, data): return base64.b64encode(cPickle.dumps(data)) # Subfield tags used to upload particular types of MoreInfo _mi_bibdoc = "w" _mi_bibdoc_version = "p" _mi_bibdoc_version_format = "b" _mi_bibdoc_format = "u" def _generate_moreinfo_tag(self, mi_type, data): """ """ serialised = self._serialise_data(data) return """%s""" % (mi_type, serialised) def test_document_moreinfo_insert(self): """bibupload - Inserting new MoreInfo to the document 1) Inserting new MoreInfo to new document 2) Inserting new MoreInfo keys existing document version 3) Removing keys from MoreInfo 4) Removing document and asserting, MoreInfo gets removed as well 5) Overriding MoreInfo keys """ moreinfo_upload_template = """ %(siteurl)s/img/site_logo.gif 0106015_01 .jpg restricted_picture %%(additional_content)s Some author """ % {"siteurl": CFG_SITE_URL} sfs = [] sfs.append(self._generate_moreinfo_tag(BibUploadMoreInfoTest._mi_bibdoc, {"first namespace" : {"type": "document moreinfo"}})) sfs.append(self._generate_moreinfo_tag(BibUploadMoreInfoTest._mi_bibdoc_version, {"first namespace" : {"type": "Bibdoc - version moreinfo"}})) sfs.append(self._generate_moreinfo_tag(BibUploadMoreInfoTest._mi_bibdoc_version_format, {"first namespace" : {"type": "Bibdoc - version, format moreinfo"}})) sfs.append(self._generate_moreinfo_tag(BibUploadMoreInfoTest._mi_bibdoc_format, {"first namespace" : {"type": "Bibdoc - format moreinfo"}})) marcxml_1 = moreinfo_upload_template % {"additional_content" : "\n".join(sfs)} recs = bibupload.xml_marc_to_records(marcxml_1) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] # now checking if all the data has been uploaded correctly bdr = BibRecDocs(recid) doc = bdr.list_bibdocs()[0] docid = doc.get_id() mi_doc = MoreInfo(docid = docid) mi_doc_ver = MoreInfo(docid = docid, version = 1) mi_doc_ver_fmt = MoreInfo(docid = docid, version = 1, docformat=".jpg") mi_doc_fmt = MoreInfo(docid = docid, docformat=".jpg") self._dict_checker({"first namespace" : {"type": "document moreinfo"}}, mi_doc, equal=False) # in case of the document only inclusive check self._dict_checker({"first namespace" : {"type": "Bibdoc - version moreinfo"}}, mi_doc_ver) self._dict_checker({"first namespace" : { "type": "Bibdoc - version, format moreinfo"}}, mi_doc_ver_fmt) self._dict_checker({"first namespace" : {"type": "Bibdoc - format moreinfo"}}, mi_doc_fmt) #now appending to a particular version of MoreInfo # uplad new key to an existing dictionary of a version def _get_mit_template(recid, bibdocid=None, bibdocname=None, version=None, docformat=None, relation=None, data=None): if data is None: ser = None else: ser = base64.b64encode(cPickle.dumps(data)) subfields = [] for s_code, val in (("r", relation), ("i", bibdocid), ("n", bibdocname), ("v", version), ("f", docformat) , ("m", ser)): if not val is None: subfields.append("""%s""" % \ (s_code, val)) return """ %s %s """ % (str(recid), ("\n".join(subfields))) marcxml_2 = _get_mit_template(recid, version = 1, bibdocid = docid, data= {"first namespace" : {"new key": {1:2, 987:678}}}) recs = bibupload.xml_marc_to_records(marcxml_2) bibupload.bibupload_records(recs, opt_mode='append') mi = MoreInfo(docid = docid, version = 1) self._dict_checker({ "first namespace" : {"type": "Bibdoc - version moreinfo", "new key": {1:2, 987:678} } }, mi) #removing the entire old content of the MoreInfo and uploading new data = {"ns1" : {"nk1": 12, "mk1": "this is new content"}, "namespace 
two" : {"ddd" : "bbb"}} marcxml_3 = _get_mit_template(recid, version = 1, bibdocid = docid, data= data) recs = bibupload.xml_marc_to_records(marcxml_3) bibupload.bibupload_records(recs, opt_mode='correct') mi = MoreInfo(docid = docid, version = 1) self._dict_checker(data, mi) # removing a particular key marcxml_4 = _get_mit_template(recid, version = 1, bibdocid = docid, data= {"ns1": {"nk1" : None}}) recs = bibupload.xml_marc_to_records(marcxml_4) bibupload.bibupload_records(recs, opt_mode='append') mi = MoreInfo(docid = docid, version = 1) self._dict_checker( {"ns1" : { "mk1": "this is new content"}, "namespace two" : {"ddd" : "bbb"}}, mi) # adding new key marcxml_5 = _get_mit_template(recid, version = 1, bibdocid = docid, data= {"ns1": {"newkey" : "newvalue"}}) recs = bibupload.xml_marc_to_records(marcxml_5) bibupload.bibupload_records(recs, opt_mode='append') mi = MoreInfo(docid = docid, version = 1) self._dict_checker( {"ns1" : { "mk1": "this is new content", "newkey" : "newvalue"}, "namespace two" : {"ddd" : "bbb"}}, mi) class BibUploadInsertModeTest(GenericBibUploadTest): """Testing insert mode.""" def setUp(self): # pylint: disable=C0103 """Initialise the MARCXML variable""" GenericBibUploadTest.setUp(self) self.test = """ something Tester, J Y MIT Tester, K J CERN2 Tester, G CERN3 test11 test31 test12 test32 test13 test33 test21 test41 test22 test42 test14 test51 test52 Tester, T CERN """ self.test_hm = """ 100__ $$aTester, T$$uCERN 111__ $$atest11$$ctest31 111__ $$atest12$$ctest32 111__ $$atest13$$ctest33 111__ $$btest21$$dtest41 111__ $$btest22$$dtest42 111__ $$atest14 111__ $$etest51 111__ $$etest52 245__ $$asomething 700__ $$aTester, J Y$$uMIT 700__ $$aTester, K J$$uCERN2 700__ $$aTester, G$$uCERN3 """ def test_create_record_id(self): """bibupload - insert mode, trying to create a new record ID in the database""" rec_id = bibupload.create_new_record() self.assertNotEqual(None, rec_id) def test_create_specific_record_id(self): """bibupload - insert mode, trying to create a new specifc record ID in the database""" expected_rec_id = run_sql("SELECT MAX(id) FROM bibrec")[0][0] + 1 rec_id = bibupload.create_new_record(expected_rec_id) self.assertEqual(rec_id, expected_rec_id) def test_no_retrieve_record_id(self): """bibupload - insert mode, detection of record ID in the input file""" # We create create the record out of the xml marc recs = bibupload.xml_marc_to_records(self.test) # We call the function which should retrieve the record id rec_id = bibupload.retrieve_rec_id(recs[0], 'insert') # We compare the value found with None self.assertEqual(None, rec_id) def test_insert_complete_xmlmarc(self): """bibupload - insert mode, trying to insert complete MARCXML file""" # Initialize the global variable # We create create the record out of the xml marc recs = bibupload.xml_marc_to_records(self.test) # We call the main function with the record as a parameter _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # We retrieve the inserted xml inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') inserted_hm = print_record(recid, 'hm') # Compare if the two MARCXML are the same self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(inserted_xm), self.test), '') self.assertEqual(compare_hmbuffers(remove_tag_001_from_hmbuffer(inserted_hm), self.test_hm), '') def test_retrieve_005_tag(self): """bibupload - insert mode, verifying insertion of 005 control field for record """ # Convert marc xml into record 
structure recs = bibupload.xml_marc_to_records(self.test) dummy, recid, dummy = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid) # Retrive the inserted record based on the record id rec = get_record(recid) # We retrieve the creationdate date from the database query = """SELECT DATE_FORMAT(last_updated,'%%Y%%m%%d%%H%%i%%s') FROM bibfmt where id_bibrec=%s AND format='xm'""" res = run_sql(query, (recid, )) self.assertEqual(record_has_field(rec, '005'), True) self.assertEqual(str(res[0][0]) + '.0', record_get_field_value(rec, '005', '', '')) class BibUploadAppendModeTest(GenericBibUploadTest): """Testing append mode.""" def setUp(self): # pylint: disable=C0103 """Initialize the MARCXML variable""" GenericBibUploadTest.setUp(self) self.test_existing = """ 123456789 Tester, T DESY 0003719PHOPHO """ self.test_to_append = """ 123456789 Tester, U CERN 0003719PHOPHO """ self.test_expected_xm = """ 123456789 Tester, T DESY Tester, U CERN 0003719PHOPHO """ self.test_expected_hm = """ 001__ 123456789 100__ $$aTester, T$$uDESY 100__ $$aTester, U$$uCERN 970__ $$a0003719PHOPHO """ # insert test record: test_to_upload = self.test_existing.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_to_upload) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) self.test_recid = recid # replace test buffers with real recid of inserted test record: self.test_existing = self.test_existing.replace('123456789', str(self.test_recid)) self.test_to_append = self.test_to_append.replace('123456789', str(self.test_recid)) self.test_expected_xm = self.test_expected_xm.replace('123456789', str(self.test_recid)) self.test_expected_hm = self.test_expected_hm.replace('123456789', str(self.test_recid)) def test_retrieve_record_id(self): """bibupload - append mode, the input file should contain a record ID""" # We create create the record out of the xml marc recs = bibupload.xml_marc_to_records(self.test_to_append) # We call the function which should retrieve the record id rec_id = bibupload.retrieve_rec_id(recs[0], 'append') # We compare the value found with None self.assertEqual(self.test_recid, rec_id) # clean up after ourselves: def test_update_modification_record_date(self): """bibupload - append mode, checking the update of the modification date""" # Initialize the global variable # We create create the record out of the xml marc recs = bibupload.xml_marc_to_records(self.test_existing) # We call the function which should retrieve the record id rec_id = bibupload.retrieve_rec_id(recs[0], opt_mode='append') # Retrieve current localtime record_modification_date = time.localtime() # We update the modification date bibupload.update_bibrec_date(convert_datestruct_to_datetext(record_modification_date), rec_id, False) # We retrieve the modification date from the database query = """SELECT DATE_FORMAT(modification_date,'%%Y-%%m-%%d %%H:%%i:%%s') FROM bibrec where id = %s""" res = run_sql(query, (str(rec_id), )) # We compare the two results self.assertEqual(res[0][0], convert_datestruct_to_datetext(record_modification_date)) # clean up after ourselves: def test_append_complete_xml_marc(self): """bibupload - append mode, appending complete MARCXML file""" # Now we append a datafield # We create create the record out of the xml marc recs = bibupload.xml_marc_to_records(self.test_to_append) # We call the main function with the record as a parameter _, recid, _ = bibupload.bibupload_records(recs, opt_mode='append')[0] 
self.check_record_consistency(recid) # We retrieve the inserted xm after_append_xm = print_record(recid, 'xm') after_append_hm = print_record(recid, 'hm') # Compare if the two MARCXML are the same self.assertEqual(compare_xmbuffers(after_append_xm, self.test_expected_xm), '') self.assertEqual(compare_hmbuffers(after_append_hm, self.test_expected_hm), '') def test_retrieve_updated_005_tag(self): """bibupload - append mode, updating 005 control tag after modifiction """ recs = bibupload.xml_marc_to_records(self.test_to_append) _, recid, _ = bibupload.bibupload(recs[0], opt_mode='append') self.check_record_consistency(recid) rec = get_record(recid) query = """SELECT DATE_FORMAT(MAX(job_date),'%%Y%%m%%d%%H%%i%%s') FROM hstRECORD where id_bibrec = %s""" res = run_sql(query, (str(recid), )) self.assertEqual(str(res[0][0])+'.0',record_get_field_value(rec,'005','','')) class BibUploadCorrectModeTest(GenericBibUploadTest): """ Testing correcting a record containing similar tags (identical tag, different indicators). Currently Invenio replaces only those tags that have matching indicators too, unlike ALEPH500 that does not pay attention to indicators, it corrects all fields with the same tag, regardless of the indicator values. """ def setUp(self): """Initialize the MARCXML test record.""" GenericBibUploadTest.setUp(self) self.testrec1_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ self.testrec1_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 10047 $$aTest, John$$uTest University 10048 $$aCool 10047 $$aTest, Jim$$uTest Laboratory """ self.testrec1_xm_to_correct = """ 123456789 Test, Joseph Test Academy Test2, Joseph Test2 Academy """ self.testrec1_corrected_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Cool Test, Joseph Test Academy Test2, Joseph Test2 Academy """ self.testrec1_corrected_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 10048 $$aCool 10047 $$aTest, Joseph$$uTest Academy 10047 $$aTest2, Joseph$$uTest2 Academy """ # insert test record: test_record_xm = self.testrec1_xm.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_record_xm) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recID: self.testrec1_xm = self.testrec1_xm.replace('123456789', str(recid)) self.testrec1_hm = self.testrec1_hm.replace('123456789', str(recid)) self.testrec1_xm_to_correct = self.testrec1_xm_to_correct.replace('123456789', str(recid)) self.testrec1_corrected_xm = self.testrec1_corrected_xm.replace('123456789', str(recid)) self.testrec1_corrected_hm = self.testrec1_corrected_hm.replace('123456789', str(recid)) # test of the inserted record: inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(inserted_xm, self.testrec1_xm), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.testrec1_hm), '') def test_record_correction(self): """bibupload - correct mode, similar MARCXML tags/indicators""" # correct some tags: recs = bibupload.xml_marc_to_records(self.testrec1_xm_to_correct) _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(self.recid) corrected_xm = print_record(self.recid, 'xm') corrected_hm = print_record(self.recid, 'hm') # did it work? 
self.assertEqual(compare_xmbuffers(corrected_xm, self.testrec1_corrected_xm), '') self.assertEqual(compare_hmbuffers(corrected_hm, self.testrec1_corrected_hm), '') # clean up after ourselves: return class BibUploadDeleteModeTest(GenericBibUploadTest): """ Testing deleting specific tags from a record while keeping anything else untouched. Currently Invenio deletes only those tags that have matching indicators too, unlike ALEPH500 that does not pay attention to indicators, it corrects all fields with the same tag, regardless of the indicator values. """ def setUp(self): """Initialize the MARCXML test record.""" GenericBibUploadTest.setUp(self) self.testrec1_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory dumb text """ self.testrec1_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 10047 $$aTest, John$$uTest University 10048 $$aCool 10047 $$aTest, Jim$$uTest Laboratory 888__ $$adumb text """ self.testrec1_xm_to_delete = """ 123456789 Test, Jane Test Institute Test, Johnson Test University Cool dumb text """ self.testrec1_corrected_xm = """ 123456789 SzGeCERN Test, John Test University Test, Jim Test Laboratory """ self.testrec1_corrected_hm = """ 001__ 123456789 003__ SzGeCERN 10047 $$aTest, John$$uTest University 10047 $$aTest, Jim$$uTest Laboratory """ # insert test record: test_record_xm = self.testrec1_xm.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_record_xm) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recID: self.testrec1_xm = self.testrec1_xm.replace('123456789', str(recid)) self.testrec1_hm = self.testrec1_hm.replace('123456789', str(recid)) self.testrec1_xm_to_delete = self.testrec1_xm_to_delete.replace('123456789', str(recid)) self.testrec1_corrected_xm = self.testrec1_corrected_xm.replace('123456789', str(recid)) self.testrec1_corrected_hm = self.testrec1_corrected_hm.replace('123456789', str(recid)) # test of the inserted record: inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(inserted_xm, self.testrec1_xm), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.testrec1_hm), '') # Checking dumb text is in bibxxx self.failUnless(run_sql("SELECT id_bibrec from bibrec_bib88x WHERE id_bibrec=%s", (recid, ))) def test_record_tags_deletion(self): """bibupload - delete mode, deleting specific tags""" # correct some tags: recs = bibupload.xml_marc_to_records(self.testrec1_xm_to_delete) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='delete')[0] self.check_record_consistency(recid) corrected_xm = print_record(recid, 'xm') corrected_hm = print_record(recid, 'hm') # did it work? 
self.assertEqual(compare_xmbuffers(corrected_xm, self.testrec1_corrected_xm), '') self.assertEqual(compare_hmbuffers(corrected_hm, self.testrec1_corrected_hm), '') # Checking dumb text is no more in bibxxx self.failIf(run_sql("SELECT id_bibrec from bibrec_bib88x WHERE id_bibrec=%s", (recid, ))) # clean up after ourselves: class BibUploadReplaceModeTest(GenericBibUploadTest): """Testing replace mode.""" def test_record_replace(self): """bibupload - replace mode, similar MARCXML tags/indicators""" # replace some tags: testrec1_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ testrec1_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 10047 $$aTest, John$$uTest University 10048 $$aCool 10047 $$aTest, Jim$$uTest Laboratory """ testrec1_xm_to_replace = """ 123456789 Test, Joseph Test Academy Test2, Joseph Test2 Academy """ testrec1_replaced_xm = """ 123456789 Test, Joseph Test Academy Test2, Joseph Test2 Academy """ testrec1_replaced_hm = """ 001__ 123456789 10047 $$aTest, Joseph$$uTest Academy 10047 $$aTest2, Joseph$$uTest2 Academy """ # insert test record: test_record_xm = testrec1_xm.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_record_xm) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recID: testrec1_xm = testrec1_xm.replace('123456789', str(recid)) testrec1_hm = testrec1_hm.replace('123456789', str(recid)) testrec1_xm_to_replace = testrec1_xm_to_replace.replace('123456789', str(recid)) testrec1_replaced_xm = testrec1_replaced_xm.replace('123456789', str(recid)) testrec1_replaced_hm = testrec1_replaced_hm.replace('123456789', str(recid)) # test of the inserted record: inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(inserted_xm, testrec1_xm), '') self.assertEqual(compare_hmbuffers(inserted_hm, testrec1_hm), '') recs = bibupload.xml_marc_to_records(testrec1_xm_to_replace) _, recid, _ = bibupload.bibupload(recs[0], opt_mode='replace') self.check_record_consistency(recid) replaced_xm = print_record(recid, 'xm') replaced_hm = print_record(recid, 'hm') # did it work? self.assertEqual(compare_xmbuffers(replaced_xm, testrec1_replaced_xm), '') self.assertEqual(compare_hmbuffers(replaced_hm, testrec1_replaced_hm), '') def test_record_replace_force_non_existing(self): """bibupload - replace mode, force non existing recid""" # replace some tags: the_recid = self.last_recid + 1 testrec1_xm = """ %s SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ % the_recid testrec1_hm = """ 001__ %s 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 10047 $$aTest, John$$uTest University 10048 $$aCool 10047 $$aTest, Jim$$uTest Laboratory """ % the_recid recs = bibupload.xml_marc_to_records(testrec1_xm) task_set_option('force', True) try: err, recid, msg = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(recid) finally: task_set_option('force', False) replaced_xm = print_record(recid, 'xm') replaced_hm = print_record(recid, 'hm') # did it work? 
self.assertEqual(compare_xmbuffers(replaced_xm, testrec1_xm), '') self.assertEqual(compare_hmbuffers(replaced_hm, testrec1_hm), '') self.assertEqual(recid, the_recid) def test_record_replace_non_existing(self): """bibupload - replace mode, non existing recid""" # replace some tags: the_recid = self.last_recid + 1 testrec1_xm = """ %s SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ % the_recid recs = bibupload.xml_marc_to_records(testrec1_xm) err, recid, _ = bibupload.bibupload(recs[0], opt_mode='replace') self.assertEqual((err, recid), (1, -1)) def test_record_replace_two_recids(self): """bibupload - replace mode, two recids""" # replace some tags: testrec1_xm = """ 300 305 SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ recs = bibupload.xml_marc_to_records(testrec1_xm) err, recid, _ = bibupload.bibupload(recs[0], opt_mode='replace') # did it work? self.assertEqual((err, recid), (1, -1)) class BibUploadReferencesModeTest(GenericBibUploadTest): """Testing references mode. NOTE: in the past this was done by calling bibupload --reference|-z which is now simply implying bibupload --correct. """ def setUp(self): """Initialize the MARCXML variable""" GenericBibUploadTest.setUp(self) self.test_insert = """ 123456789 Tester, T CERN """ self.test_reference = """ 123456789 M. Lüscher and P. Weisz, String excitation energies in SU(N) gauge theories beyond the free-string approximation, J. High Energy Phys. 07 (2004) 014 """ self.test_reference_expected_xm = """ 123456789 Tester, T CERN M. Lüscher and P. Weisz, String excitation energies in SU(N) gauge theories beyond the free-string approximation, J. High Energy Phys. 07 (2004) 014 """ self.test_insert_hm = """ 001__ 123456789 100__ $$aTester, T$$uCERN """ self.test_reference_expected_hm = """ 001__ 123456789 100__ $$aTester, T$$uCERN %(reference_tag)sC5 $$mM. Lüscher and P. Weisz, String excitation energies in SU(N) gauge theories beyond the free-string approximation,$$sJ. High Energy Phys. 
07 (2004) 014 """ % {'reference_tag': CFG_BIBUPLOAD_REFERENCE_TAG} # insert test record: test_insert = self.test_insert.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_insert) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recID: self.test_insert = self.test_insert.replace('123456789', str(recid)) self.test_insert_hm = self.test_insert_hm.replace('123456789', str(recid)) self.test_reference = self.test_reference.replace('123456789', str(recid)) self.test_reference_expected_xm = self.test_reference_expected_xm.replace('123456789', str(recid)) self.test_reference_expected_hm = self.test_reference_expected_hm.replace('123456789', str(recid)) # test of the inserted record: inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(inserted_xm, self.test_insert), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.test_insert_hm), '') self.test_recid = recid def test_reference_complete_xml_marc(self): """bibupload - reference mode, inserting references MARCXML file""" # We create the record out of the XML MARC recs = bibupload.xml_marc_to_records(self.test_reference) # We call the main function with the record as a parameter dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='reference')[0] self.check_record_consistency(recid) # We retrieve the inserted xml reference_xm = print_record(recid, 'xm') reference_hm = print_record(recid, 'hm') # Compare whether the two MARCXML buffers are the same self.assertEqual(compare_xmbuffers(reference_xm, self.test_reference_expected_xm), '') self.assertEqual(compare_hmbuffers(reference_hm, self.test_reference_expected_hm), '') class BibUploadRecordsWithSYSNOTest(GenericBibUploadTest): """Testing uploading of records that have external SYSNO present.""" def setUp(self): # pylint: disable=C0103 """Initialize the MARCXML test records.""" GenericBibUploadTest.setUp(self) # Note that SYSNO fields are repeated but with different # subfields, this is to test whether bibupload would not # mistakenly pick up wrong values.
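# The MARCXML/HM buffers below are parameterised by slicing the configured MARC
# tag string into its components. As an illustration only (the actual value of
# CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG depends on the site configuration; "970__a"
# is a purely hypothetical example):
#   tag  = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3]   # e.g. "970"
#   ind1 = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4]   # "_" is rendered as " " in MARCXML
#   ind2 = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5]   # "_" is rendered as " " in MARCXML
#   code = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6]   # e.g. "a"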
self.xm_testrec1 = """ 123456789 SzGeCERN Bar, Baz Foo On the quux and huux 1 sysno1 sysno2 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] or " ", 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] or " ", 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } self.hm_testrec1 = """ 001__ 123456789 003__ SzGeCERN 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 %(sysnotag)s%(sysnoind1)s%(sysnoind2)s $$%(sysnosubfieldcode)ssysno1 %(sysnotag)s%(sysnoind1)s%(sysnoind2)s $$0sysno2 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4], 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5], 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } self.xm_testrec1_to_update = """ SzGeCERN Bar, Baz Foo On the quux and huux 1 Updated sysno1 sysno2 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] or " ", 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] or " ", 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } self.xm_testrec1_updated = """ 123456789 SzGeCERN Bar, Baz Foo On the quux and huux 1 Updated sysno1 sysno2 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] or " ", 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] or " ", 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } self.hm_testrec1_updated = """ 001__ 123456789 003__ SzGeCERN 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 Updated %(sysnotag)s%(sysnoind1)s%(sysnoind2)s $$%(sysnosubfieldcode)ssysno1 %(sysnotag)s%(sysnoind1)s%(sysnoind2)s $$0sysno2 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4], 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5], 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } self.xm_testrec2 = """ 987654321 SzGeCERN Bar, Baz Foo On the quux and huux 2 sysno2 sysno1 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] or " ", 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] or " ", 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } self.hm_testrec2 = """ 001__ 987654321 003__ SzGeCERN 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 2 %(sysnotag)s%(sysnoind1)s%(sysnoind2)s $$%(sysnosubfieldcode)ssysno2 %(sysnotag)s%(sysnoind1)s%(sysnoind2)s $$0sysno1 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4], 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5], 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } def test_insert_the_same_sysno_record(self): """bibupload - SYSNO tag, refuse to insert the same SYSNO record""" # initialize bibupload mode: if self.verbose: print "test_insert_the_same_sysno_record() started" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, 
opt_mode='insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # insert record 2 first time: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr2, recid2, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid2) inserted_xm = print_record(recid2, 'xm') inserted_hm = print_record(recid2, 'hm') # use real recID when comparing whether it worked: self.xm_testrec2 = self.xm_testrec2.replace('987654321', str(recid2)) self.hm_testrec2 = self.hm_testrec2.replace('987654321', str(recid2)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec2), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec2), '') # try to insert updated record 1, it should fail: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) dummyerr1_updated, recid1_updated, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.assertEqual(-1, recid1_updated) if self.verbose: print "test_insert_the_same_sysno_record() finished" def test_insert_or_replace_the_same_sysno_record(self): """bibupload - SYSNO tag, allow to insert or replace the same SYSNO record""" # initialize bibupload mode: if self.verbose: print "test_insert_or_replace_the_same_sysno_record() started" # insert/replace record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to insert/replace updated record 1, it should be okay: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) dummyerr1_updated, recid1_updated, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1_updated) inserted_xm = print_record(recid1_updated, 'xm') inserted_hm = print_record(recid1_updated, 'hm') self.assertEqual(recid1, recid1_updated) # use real recID in test buffers when comparing whether it worked: self.xm_testrec1_updated = self.xm_testrec1_updated.replace('123456789', str(recid1)) self.hm_testrec1_updated = self.hm_testrec1_updated.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1_updated), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1_updated), '') if self.verbose: print "test_insert_or_replace_the_same_sysno_record() finished" def test_replace_nonexisting_sysno_record(self): """bibupload - SYSNO tag, refuse to replace non-existing SYSNO record""" # initialize bibupload mode: if self.verbose: print 
"test_replace_nonexisting_sysno_record() started" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummy, recid1, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to replace record 2 it should fail: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummy, recid2, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.assertEqual(-1, recid2) if self.verbose: print "test_replace_nonexisting_sysno_record() finished" class BibUploadRecordsWithEXTOAIIDTest(GenericBibUploadTest): """Testing uploading of records that have external EXTOAIID present.""" def setUp(self): # pylint: disable=C0103 """Initialize the MARCXML test records.""" GenericBibUploadTest.setUp(self) # Note that EXTOAIID fields are repeated but with different # subfields, this is to test whether bibupload would not # mistakenly pick up wrong values. self.xm_testrec1 = """ 123456789 SzGeCERN extoaiid1 extoaisrc1 extoaiid2 Bar, Baz Foo On the quux and huux 1 """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or " ", 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or " ", 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } self.hm_testrec1 = """ 001__ 123456789 003__ SzGeCERN %(extoaiidtag)s%(extoaiidind1)s%(extoaiidind2)s $$%(extoaisrcsubfieldcode)sextoaisrc1$$%(extoaiidsubfieldcode)sextoaiid1 %(extoaiidtag)s%(extoaiidind1)s%(extoaiidind2)s $$0extoaiid2 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4], 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5], 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } self.xm_testrec1_to_update = """ SzGeCERN extoaiid1 extoaisrc1 extoaiid2 Bar, Baz Foo On the quux and huux 1 Updated """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or " ", 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or " ", 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } self.xm_testrec1_updated = """ 123456789 SzGeCERN extoaiid1 extoaisrc1 extoaiid2 Bar, Baz Foo On the quux and huux 1 Updated """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or " ", 'extoaiidind2': 
CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or " ", 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } self.hm_testrec1_updated = """ 001__ 123456789 003__ SzGeCERN %(extoaiidtag)s%(extoaiidind1)s%(extoaiidind2)s $$%(extoaisrcsubfieldcode)sextoaisrc1$$%(extoaiidsubfieldcode)sextoaiid1 %(extoaiidtag)s%(extoaiidind1)s%(extoaiidind2)s $$0extoaiid2 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 Updated """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4], 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5], 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } self.xm_testrec2 = """ 987654321 SzGeCERN extoaiid2 extoaisrc1 extoaiid1 Bar, Baz Foo On the quux and huux 2 """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or " ", 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or " ", 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } self.hm_testrec2 = """ 001__ 987654321 003__ SzGeCERN %(extoaiidtag)s%(extoaiidind1)s%(extoaiidind2)s $$%(extoaisrcsubfieldcode)sextoaisrc1$$%(extoaiidsubfieldcode)sextoaiid2 %(extoaiidtag)s%(extoaiidind1)s%(extoaiidind2)s $$0extoaiid1 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 2 """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4], 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5], 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } def test_insert_the_same_extoaiid_record(self): """bibupload - EXTOAIID tag, refuse to insert the same EXTOAIID record""" # initialize bibupload mode: if self.verbose: print "test_insert_the_same_extoaiid_record() started" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # insert record 2 first time: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr2, recid2, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid2) inserted_xm = print_record(recid2, 'xm') inserted_hm = print_record(recid2, 'hm') # use real recID when comparing whether it worked: self.xm_testrec2 = self.xm_testrec2.replace('987654321', str(recid2)) self.hm_testrec2 = self.hm_testrec2.replace('987654321', str(recid2)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec2), '') 
self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec2), '') # try to insert updated record 1, it should fail: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) dummyerr1_updated, recid1_updated, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.assertEqual(-1, recid1_updated) if self.verbose: print "test_insert_the_same_extoaiid_record() finished" def test_insert_or_replace_the_same_extoaiid_record(self): """bibupload - EXTOAIID tag, allow to insert or replace the same EXTOAIID record""" # initialize bibupload mode: if self.verbose: print "test_insert_or_replace_the_same_extoaiid_record() started" # insert/replace record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to insert/replace updated record 1, it should be okay: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) dummyerr1_updated, recid1_updated, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1_updated) inserted_xm = print_record(recid1_updated, 'xm') inserted_hm = print_record(recid1_updated, 'hm') self.assertEqual(recid1, recid1_updated) # use real recID in test buffers when comparing whether it worked: self.xm_testrec1_updated = self.xm_testrec1_updated.replace('123456789', str(recid1)) self.hm_testrec1_updated = self.hm_testrec1_updated.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1_updated), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1_updated), '') if self.verbose: print "test_insert_or_replace_the_same_extoaiid_record() finished" def test_replace_nonexisting_extoaiid_record(self): """bibupload - EXTOAIID tag, refuse to replace non-existing EXTOAIID record""" # initialize bibupload mode: if self.verbose: print "test_replace_nonexisting_extoaiid_record() started" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to replace record 2 it should fail: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr2, recid2, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.assertEqual(-1, recid2) if self.verbose: print 
"test_replace_nonexisting_extoaiid_record() finished" class BibUploadRecordsWithOAIIDTest(GenericBibUploadTest): """Testing uploading of records that have OAI ID present.""" def setUp(self): """Initialize the MARCXML test records.""" GenericBibUploadTest.setUp(self) # Note that OAI fields are repeated but with different # subfields, this is to test whether bibupload would not # mistakenly pick up wrong values. self.xm_testrec1 = """ 123456789 SzGeCERN Bar, Baz Foo On the quux and huux 1 oai:foo:1 oai:foo:2 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4] != "_" and \ CFG_OAI_ID_FIELD[3:4] or " ", 'oaiind2': CFG_OAI_ID_FIELD[4:5] != "_" and \ CFG_OAI_ID_FIELD[4:5] or " ", 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } self.hm_testrec1 = """ 001__ 123456789 003__ SzGeCERN 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 %(oaitag)s%(oaiind1)s%(oaiind2)s $$%(oaisubfieldcode)soai:foo:1 %(oaitag)s%(oaiind1)s%(oaiind2)s $$0oai:foo:2 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4], 'oaiind2': CFG_OAI_ID_FIELD[4:5], 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } self.xm_testrec1_to_update = """ SzGeCERN Bar, Baz Foo On the quux and huux 1 Updated oai:foo:1 oai:foo:2 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4] != "_" and \ CFG_OAI_ID_FIELD[3:4] or " ", 'oaiind2': CFG_OAI_ID_FIELD[4:5] != "_" and \ CFG_OAI_ID_FIELD[4:5] or " ", 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } self.xm_testrec1_updated = """ 123456789 SzGeCERN Bar, Baz Foo On the quux and huux 1 Updated oai:foo:1 oai:foo:2 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4] != "_" and \ CFG_OAI_ID_FIELD[3:4] or " ", 'oaiind2': CFG_OAI_ID_FIELD[4:5] != "_" and \ CFG_OAI_ID_FIELD[4:5] or " ", 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } self.hm_testrec1_updated = """ 001__ 123456789 003__ SzGeCERN 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 Updated %(oaitag)s%(oaiind1)s%(oaiind2)s $$%(oaisubfieldcode)soai:foo:1 %(oaitag)s%(oaiind1)s%(oaiind2)s $$0oai:foo:2 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4], 'oaiind2': CFG_OAI_ID_FIELD[4:5], 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } self.xm_testrec2 = """ 987654321 SzGeCERN Bar, Baz Foo On the quux and huux 2 oai:foo:2 oai:foo:1 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4] != "_" and \ CFG_OAI_ID_FIELD[3:4] or " ", 'oaiind2': CFG_OAI_ID_FIELD[4:5] != "_" and \ CFG_OAI_ID_FIELD[4:5] or " ", 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } self.hm_testrec2 = """ 001__ 987654321 003__ SzGeCERN 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 2 %(oaitag)s%(oaiind1)s%(oaiind2)s $$%(oaisubfieldcode)soai:foo:2 %(oaitag)s%(oaiind1)s%(oaiind2)s $$0oai:foo:1 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4], 'oaiind2': CFG_OAI_ID_FIELD[4:5], 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } def test_insert_the_same_oai_record(self): """bibupload - OAIID tag, refuse to insert the same OAI record""" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', 
str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # insert record 2 first time: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr2, recid2, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid2) inserted_xm = print_record(recid2, 'xm') inserted_hm = print_record(recid2, 'hm') # use real recID when comparing whether it worked: self.xm_testrec2 = self.xm_testrec2.replace('987654321', str(recid2)) self.hm_testrec2 = self.hm_testrec2.replace('987654321', str(recid2)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec2), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec2), '') # try to insert updated record 1, it should fail: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) dummyerr1_updated, recid1_updated, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.assertEqual(-1, recid1_updated) def test_insert_or_replace_the_same_oai_record(self): """bibupload - OAIID tag, allow to insert or replace the same OAI record""" # initialize bibupload mode: # insert/replace record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to insert/replace updated record 1, it should be okay: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) dummyerr1_updated, recid1_updated, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1_updated) inserted_xm = print_record(recid1_updated, 'xm') inserted_hm = print_record(recid1_updated, 'hm') self.assertEqual(recid1, recid1_updated) # use real recID in test buffers when comparing whether it worked: self.xm_testrec1_updated = self.xm_testrec1_updated.replace('123456789', str(recid1)) self.hm_testrec1_updated = self.hm_testrec1_updated.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1_updated), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1_updated), '') def test_replace_nonexisting_oai_record(self): """bibupload - OAIID tag, refuse to replace non-existing OAI record""" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, 
self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to replace record 2 it should fail: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr2, recid2, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.assertEqual(-1, recid2) class BibUploadRecordsWithDOITest(GenericBibUploadTest): """Testing uploading of records with DOI.""" def setUp(self): """Initialize the MARCXML test records.""" GenericBibUploadTest.setUp(self) self.xm_testrec1 = """ 123456789 SzGeCERN doi 10.5170/123-456-789 nondoi 10.5170/123-456-789-0 Bar, Baz Foo On the quux and huux 1 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.hm_testrec1 = """ 001__ 123456789 003__ SzGeCERN %(doitag)s%(doiind1)s%(doiind2)s $$%(doisubfieldcodesource)sdoi$$%(doisubfieldcodevalue)s10.5170/123-456-789 %(doitag)s%(doiind1)s%(doiind2)s $$%(doisubfieldcodesource)snondoi$$%(doisubfieldcodevalue)s10.5170/123-456-789-0 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': '_', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.xm_testrec1_to_update = """ SzGeCERN doi 10.5170/123-456-789 nondoi 10.5170/123-456-789-0 Bar, Baz Foo On the quux and huux 1 Updated """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.xm_testrec1_updated = """ 123456789 SzGeCERN doi 10.5170/123-456-789 nondoi 10.5170/123-456-789-0 Bar, Baz Foo On the quux and huux 1 Updated """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.hm_testrec1_updated = """ 001__ 123456789 003__ SzGeCERN %(doitag)s%(doiind1)s%(doiind2)s $$%(doisubfieldcodesource)sdoi$$%(doisubfieldcodevalue)s10.5170/123-456-789 %(doitag)s%(doiind1)s%(doiind2)s $$%(doisubfieldcodesource)snondoi$$%(doisubfieldcodevalue)s10.5170/123-456-789-0 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 Updated """ % {'doitag': '024', 'doiind1': '7', 'doiind2': '_', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.xm_testrec2 = """ 987654321 SzGeCERN doi 10.5170/987-654-321 Bar, Baz Foo On the quux and huux 2 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.hm_testrec2 = """ 001__ 987654321 003__ SzGeCERN %(doitag)s%(doiind1)s%(doiind2)s $$%(doisubfieldcodesource)sdoi$$%(doisubfieldcodevalue)s10.5170/987-654-321 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 2 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': '_', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.xm_testrec2_to_update = """ 987654321 SzGeCERN doi 10.5170/123-456-789 Bar, Baz Foo """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.xm_testrec3 = """ 192837645 SzGeCERN doi 10.5170/123-456-789-0 Bar, Baz Foo On the quux and huux 4 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.hm_testrec3 = """ 001__ 192837645 003__ SzGeCERN %(doitag)s%(doiind1)s%(doiind2)s $$%(doisubfieldcodesource)sdoi$$%(doisubfieldcodevalue)s10.5170/123-456-789-0 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 4 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': '_', 'doisubfieldcodevalue': 'a', 
'doisubfieldcodesource': '2' } self.xm_testrec4 = """ SzGeCERN doi 10.5170/123-456-789-non-existing Bar, Baz Foo On the quux and huux 5 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.xm_testrec5 = """ 123456789 SzGeCERN doi 10.5170/123-456-789 doi 10.5170/987-654-321 Bar, Baz Foo On the quux and huux 6 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } def test_insert_the_same_doi_matching_on_doi(self): """bibupload - DOI tag, refuse to "insert" twice same DOI (matching on DOI)""" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1, recid1, msg1 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # insert record 2 first time: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err2, recid2, msg2 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid2) inserted_xm = print_record(recid2, 'xm') inserted_hm = print_record(recid2, 'hm') # use real recID when comparing whether it worked: self.xm_testrec2 = self.xm_testrec2.replace('987654321', str(recid2)) self.hm_testrec2 = self.hm_testrec2.replace('987654321', str(recid2)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec2), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec2), '') # try to insert again record 1 (without recid, matching on DOI) testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1_updated, recid1_updated, msg1_updated = bibupload.bibupload(recs[0], opt_mode='insert') self.assertEqual(-1, recid1_updated) # if we try to update, append or correct, the same record is matched recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) err1_updated, recid1_updated, msg1_updated = bibupload.bibupload(recs[0], opt_mode='correct') self.check_record_consistency(recid1_updated) self.assertEqual(recid1, recid1_updated) err1_updated, recid1_updated, msg1_updated = bibupload.bibupload(recs[0], opt_mode='append') self.check_record_consistency(recid1_updated) self.assertEqual(recid1, recid1_updated) err1_updated, recid1_updated, msg1_updated = bibupload.bibupload(recs[0], opt_mode='replace') self.check_record_consistency(recid1_updated) self.assertEqual(recid1, recid1_updated) def test_insert_the_same_doi_matching_on_recid(self): """bibupload - DOI tag, refuse to "insert" twice same DOI (matching on recid)""" # First upload 2 test records testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1, recid1, msg1 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid1) testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err2, recid2, msg2 = 
bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid2) # try to update record 2 with DOI already in record 1. It must fail: testrec_to_update = self.xm_testrec2_to_update.replace('987654321', '%s' % recid2) recs = bibupload.xml_marc_to_records(testrec_to_update) err, recid, msg = bibupload.bibupload(recs[0], opt_mode='replace') self.check_record_consistency(recid) self.assertEqual(1, err) # Ditto in correct and append mode recs = bibupload.xml_marc_to_records(testrec_to_update) err, recid, msg = bibupload.bibupload(recs[0], opt_mode='correct') self.check_record_consistency(recid) self.assertEqual(1, err) recs = bibupload.xml_marc_to_records(testrec_to_update) err, recid, msg = bibupload.bibupload(recs[0], opt_mode='append') self.check_record_consistency(recid) self.assertEqual(1, err) def test_insert_or_replace_the_same_doi_record(self): """bibupload - DOI tag, allow to insert or replace matching on DOI""" # insert/replace record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1, recid1, msg1 = bibupload.bibupload(recs[0], opt_mode='replace_or_insert') self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to insert/replace updated record 1, it should be okay: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) err1_updated, recid1_updated, msg1_updated = bibupload.bibupload(recs[0], opt_mode='replace_or_insert') self.check_record_consistency(recid1_updated) inserted_xm = print_record(recid1_updated, 'xm') inserted_hm = print_record(recid1_updated, 'hm') self.assertEqual(recid1, recid1_updated) # use real recID in test buffers when comparing whether it worked: self.xm_testrec1_updated = self.xm_testrec1_updated.replace('123456789', str(recid1)) self.hm_testrec1_updated = self.hm_testrec1_updated.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1_updated), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1_updated), '') def test_correct_the_same_doi_record(self): """bibupload - DOI tag, allow to correct matching on DOI""" # insert/replace record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1, recid1, msg1 = bibupload.bibupload(recs[0], opt_mode='replace_or_insert') self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to correct updated record 1, it should be okay: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) err1_updated, recid1_updated, msg1_updated = bibupload.bibupload(recs[0], opt_mode='correct') 
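# the incoming MARCXML carries no recid, so correct mode is expected to match
# the existing record via its DOI (recid1 == recid1_updated is asserted below)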
self.check_record_consistency(recid1_updated) inserted_xm = print_record(recid1_updated, 'xm') inserted_hm = print_record(recid1_updated, 'hm') self.assertEqual(recid1, recid1_updated) # use real recID in test buffers when comparing whether it worked: self.xm_testrec1_updated = self.xm_testrec1_updated.replace('123456789', str(recid1)) self.hm_testrec1_updated = self.hm_testrec1_updated.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1_updated), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1_updated), '') def test_replace_nonexisting_doi_record(self): """bibupload - DOI tag, refuse to replace non-existing DOI record (matching on DOI)""" testrec_to_insert_first = self.xm_testrec4 recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err4, recid4, msg4 = bibupload.bibupload(recs[0], opt_mode='replace') self.assertEqual(-1, recid4) def test_matching_on_doi_source_field(self): """bibupload - DOI tag, test matching records using DOI value AND source field ($2)""" # insert test record 1, with a "fake" doi (not "doi" in source field): testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1, recid1, msg1 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # insert record 3, which matches record 1 "fake" doi, so it # should work. testrec_to_insert_first = self.xm_testrec3.replace('192837645', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err3, recid3, msg3 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid3) inserted_xm = print_record(recid3, 'xm') inserted_hm = print_record(recid3, 'hm') # use real recID when comparing whether it worked: self.xm_testrec3 = self.xm_testrec3.replace('192837645', str(recid3)) self.hm_testrec3 = self.hm_testrec3.replace('192837645', str(recid3)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec3), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec3), '') def test_replace_or_update_record__with_ambiguous_doi(self): """bibupload - DOI tag, refuse to replace/correct/append on the basis of ambiguous DOI""" # First upload 2 test records with two different DOIs: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1, recid1, msg1 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid1) self.assertEqual(0, err1) testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err2, recid2, msg2 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid2) self.assertEqual(0, err2) # Now try to insert record with DOIs matching the records # previously uploaded. It must fail. 
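# xm_testrec5 carries both DOIs at once (10.5170/123-456-789 from record 1 and
# 10.5170/987-654-321 from record 2), so DOI matching resolves to two different
# existing records; bibupload is expected to report an error (err == 1) in
# every mode exercised below.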
testrec = self.xm_testrec5.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='insert') self.assertEqual(1, err5) # Ditto for other modes: recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='replace_or_insert') self.assertEqual(1, err5) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='replace') self.assertEqual(1, err5) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='correct') self.assertEqual(1, err5) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='append') self.assertEqual(1, err5) # The same is true if a recid exists in the input MARCXML (as # long as DOIs are ambiguous): testrec = self.xm_testrec5.replace('123456789', '%s' % recid1) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='replace_or_insert') self.assertEqual(1, err5) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='replace') self.assertEqual(1, err5) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='correct') self.assertEqual(1, err5) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='append') self.assertEqual(1, err5) class BibUploadIndicatorsTest(GenericBibUploadTest): """ Testing uploading of a MARCXML record with indicators having either blank space (as per MARC schema) or empty string value (old behaviour). """ def setUp(self): """Initialize the MARCXML test record.""" GenericBibUploadTest.setUp(self) self.testrec1_xm = """ SzGeCERN Test, John Test University """ self.testrec1_hm = """ 003__ SzGeCERN 100__ $$aTest, John$$uTest University """ self.testrec2_xm = """ SzGeCERN Test, John Test University """ self.testrec2_hm = """ 003__ SzGeCERN 100__ $$aTest, John$$uTest University """ def test_record_with_spaces_in_indicators(self): """bibupload - inserting MARCXML with spaces in indicators""" recs = bibupload.xml_marc_to_records(self.testrec1_xm) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(inserted_xm), self.testrec1_xm), '') self.assertEqual(compare_hmbuffers(remove_tag_001_from_hmbuffer(inserted_hm), self.testrec1_hm), '') def test_record_with_no_spaces_in_indicators(self): """bibupload - inserting MARCXML with no spaces in indicators""" recs = bibupload.xml_marc_to_records(self.testrec2_xm) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(inserted_xm), self.testrec2_xm), '') self.assertEqual(compare_hmbuffers(remove_tag_001_from_hmbuffer(inserted_hm), self.testrec2_hm), '') class BibUploadUpperLowerCaseTest(GenericBibUploadTest): """ Testing treatment of similar records with only upper and lower case value differences in the bibxxx table. 
""" def setUp(self): """Initialize the MARCXML test records.""" GenericBibUploadTest.setUp(self) self.testrec1_xm = """ SzGeCERN Test, John Test University """ self.testrec1_hm = """ 003__ SzGeCERN 100__ $$aTest, John$$uTest University """ self.testrec2_xm = """ SzGeCERN TeSt, JoHn Test UniVeRsity """ self.testrec2_hm = """ 003__ SzGeCERN 100__ $$aTeSt, JoHn$$uTest UniVeRsity """ def test_record_with_upper_lower_case_letters(self): """bibupload - inserting similar MARCXML records with upper/lower case""" # insert test record #1: recs = bibupload.xml_marc_to_records(self.testrec1_xm) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid1) recid1_inserted_xm = print_record(recid1, 'xm') recid1_inserted_hm = print_record(recid1, 'hm') # insert test record #2: recs = bibupload.xml_marc_to_records(self.testrec2_xm) dummyerr1, recid2, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid2) recid2_inserted_xm = print_record(recid2, 'xm') recid2_inserted_hm = print_record(recid2, 'hm') # let us compare stuff now: self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(recid1_inserted_xm), self.testrec1_xm), '') self.assertEqual(compare_hmbuffers(remove_tag_001_from_hmbuffer(recid1_inserted_hm), self.testrec1_hm), '') self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(recid2_inserted_xm), self.testrec2_xm), '') self.assertEqual(compare_hmbuffers(remove_tag_001_from_hmbuffer(recid2_inserted_hm), self.testrec2_hm), '') class BibUploadControlledProvenanceTest(GenericBibUploadTest): """Testing treatment of tags under controlled provenance in the correct mode.""" def setUp(self): """Initialize the MARCXML test record.""" GenericBibUploadTest.setUp(self) self.testrec1_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Test title blabla sam blublu sim human """ self.testrec1_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 245__ $$aTest title 6531_ $$9sam$$ablabla 6531_ $$9sim$$ablublu 6531_ $$ahuman """ self.testrec1_xm_to_correct = """ 123456789 bleble sim bloblo som """ self.testrec1_corrected_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Test title blabla sam human bleble sim bloblo som """ self.testrec1_corrected_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 245__ $$aTest title 6531_ $$9sam$$ablabla 6531_ $$ahuman 6531_ $$9sim$$ableble 6531_ $$9som$$abloblo """ # insert test record: test_record_xm = self.testrec1_xm.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_record_xm) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recID: self.testrec1_xm = self.testrec1_xm.replace('123456789', str(recid)) self.testrec1_hm = self.testrec1_hm.replace('123456789', str(recid)) self.testrec1_xm_to_correct = self.testrec1_xm_to_correct.replace('123456789', str(recid)) self.testrec1_corrected_xm = self.testrec1_corrected_xm.replace('123456789', str(recid)) self.testrec1_corrected_hm = self.testrec1_corrected_hm.replace('123456789', str(recid)) # test of the inserted record: inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(inserted_xm, self.testrec1_xm), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.testrec1_hm), '') def test_controlled_provenance_persistence(self): """bibupload - correct mode, tags with controlled provenance""" # 
correct metadata tags; will the protected tags be kept? recs = bibupload.xml_marc_to_records(self.testrec1_xm_to_correct) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) corrected_xm = print_record(recid, 'xm') corrected_hm = print_record(recid, 'hm') # did it work? self.assertEqual(compare_xmbuffers(corrected_xm, self.testrec1_corrected_xm), '') self.assertEqual(compare_hmbuffers(corrected_hm, self.testrec1_corrected_hm), '') class BibUploadStrongTagsTest(GenericBibUploadTest): """Testing treatment of strong tags and the replace mode.""" def setUp(self): """Initialize the MARCXML test record.""" GenericBibUploadTest.setUp(self) self.testrec1_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Test title A value Another value """ % {'strong_tag': bibupload.CFG_BIBUPLOAD_STRONG_TAGS[0]} self.testrec1_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 245__ $$aTest title %(strong_tag)s__ $$aA value$$bAnother value """ % {'strong_tag': bibupload.CFG_BIBUPLOAD_STRONG_TAGS[0]} self.testrec1_xm_to_replace = """ 123456789 Test, Joseph Test Academy """ self.testrec1_replaced_xm = """ 123456789 Test, Joseph Test Academy A value Another value """ % {'strong_tag': bibupload.CFG_BIBUPLOAD_STRONG_TAGS[0]} self.testrec1_replaced_hm = """ 001__ 123456789 100__ $$aTest, Joseph$$uTest Academy %(strong_tag)s__ $$aA value$$bAnother value """ % {'strong_tag': bibupload.CFG_BIBUPLOAD_STRONG_TAGS[0]} # insert test record: test_record_xm = self.testrec1_xm.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_record_xm) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recID: self.testrec1_xm = self.testrec1_xm.replace('123456789', str(recid)) self.testrec1_hm = self.testrec1_hm.replace('123456789', str(recid)) self.testrec1_xm_to_replace = self.testrec1_xm_to_replace.replace('123456789', str(recid)) self.testrec1_replaced_xm = self.testrec1_replaced_xm.replace('123456789', str(recid)) self.testrec1_replaced_hm = self.testrec1_replaced_hm.replace('123456789', str(recid)) # test of the inserted record: inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(inserted_xm, self.testrec1_xm), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.testrec1_hm), '') def test_strong_tags_persistence(self): """bibupload - strong tags, persistence in replace mode""" # replace all metadata tags; will the strong tags be kept? recs = bibupload.xml_marc_to_records(self.testrec1_xm_to_replace) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(recid) replaced_xm = print_record(recid, 'xm') replaced_hm = print_record(recid, 'hm') # did it work? self.assertEqual(compare_xmbuffers(replaced_xm, self.testrec1_replaced_xm), '') self.assertEqual(compare_hmbuffers(replaced_hm, self.testrec1_replaced_hm), '') class BibUploadPretendTest(GenericBibUploadTest): """ Testing bibupload --pretend correctness. 
""" def setUp(self): GenericBibUploadTest.setUp(self) self.demo_data = bibupload.xml_marc_to_records(open(os.path.join(CFG_TMPDIR, 'demobibdata.xml')).read())[0] self.before = self._get_tables_fingerprint() task_set_task_param('pretend', True) def tearDown(self): GenericBibUploadTest.tearDown(self) task_set_task_param('pretend', False) @staticmethod def _get_tables_fingerprint(): """ Take lenght and last modification time of all the tables that might be touched by bibupload and return them in a nice structure. """ fingerprint = {} tables = ['bibrec', 'bibdoc', 'bibrec_bibdoc', 'bibdoc_bibdoc', 'bibfmt', 'hstDOCUMENT', 'hstRECORD', 'bibHOLDINGPEN', 'bibdocmoreinfo', 'bibdocfsinfo'] for i in xrange(100): tables.append('bib%02dx' % i) tables.append('bibrec_bib%02dx' % i) for table in tables: fingerprint[table] = get_table_status_info(table) return fingerprint @staticmethod def _checks_tables_fingerprints(before, after): """ Checks differences in table_fingerprints. """ for table in before.keys(): if before[table] != after[table]: raise StandardError("Table %s has been modified: before was [%s], after was [%s]" % (table, pprint.pformat(before[table]), pprint.pformat(after[table]))) return True def test_pretend_insert(self): """bibupload - pretend insert""" bibupload.bibupload_records([self.demo_data], opt_mode='insert', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_correct(self): """bibupload - pretend correct""" bibupload.bibupload_records([self.demo_data], opt_mode='correct', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_replace(self): """bibupload - pretend replace""" bibupload.bibupload_records([self.demo_data], opt_mode='replace', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_append(self): """bibupload - pretend append""" bibupload.bibupload_records([self.demo_data], opt_mode='append', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_replace_or_insert(self): """bibupload - pretend replace or insert""" bibupload.bibupload_records([self.demo_data], opt_mode='replace_or_insert', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_holdingpen(self): """bibupload - pretend holdingpen""" bibupload.bibupload_records([self.demo_data], opt_mode='holdingpen', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_delete(self): """bibupload - pretend delete""" bibupload.bibupload_records([self.demo_data], opt_mode='delete', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_reference(self): """bibupload - pretend reference""" bibupload.bibupload_records([self.demo_data], opt_mode='reference', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) class BibUploadHoldingPenTest(GenericBibUploadTest): """ Testing the Holding Pen usage. 
""" def setUp(self): GenericBibUploadTest.setUp(self) self.verbose = 9 setup_loggers() task_set_task_param('verbose', self.verbose) self.recid = 10 self.oai_id = "oai:cds.cern.ch:CERN-EP-2001-094" def test_holding_pen_upload_with_recid(self): """bibupload - holding pen upload with recid""" test_to_upload = """ %s Kleefeld, F Newcomer, Y Rupp, G Scadron, M D """ % self.recid recs = bibupload.xml_marc_to_records(test_to_upload) bibupload.insert_record_into_holding_pen(recs[0], "") res = run_sql("SELECT changeset_xml FROM bibHOLDINGPEN WHERE id_bibrec=%s", (self.recid, )) self.failUnless("Rupp, G" in zlib.decompress(res[0][0])) def test_holding_pen_upload_with_oai_id(self): """bibupload - holding pen upload with oai_id""" test_to_upload = """ Kleefeld, F Newcomer, Y Rupp, G Scadron, M D %(value)s """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or " ", 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or " ", 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'value': self.oai_id } recs = bibupload.xml_marc_to_records(test_to_upload) bibupload.insert_record_into_holding_pen(recs[0], self.oai_id) res = run_sql("SELECT changeset_xml FROM bibHOLDINGPEN WHERE id_bibrec=%s AND oai_id=%s", (self.recid, self.oai_id)) self.failUnless("Rupp, G" in zlib.decompress(res[0][0])) def tearDown(self): GenericBibUploadTest.tearDown(self) run_sql("DELETE FROM bibHOLDINGPEN WHERE id_bibrec=%s", (self.recid, )) class BibUploadFFTModeTest(GenericBibUploadTest): """ Testing treatment of fulltext file transfer import mode. """ def _test_bibdoc_status(self, recid, docname, status): res = run_sql('SELECT bd.status FROM bibrec_bibdoc as bb JOIN bibdoc as bd ON bb.id_bibdoc = bd.id WHERE bb.id_bibrec = %s AND bb.docname = %s', (recid, docname)) self.failUnless(res) self.assertEqual(status, res[0][0]) def test_writing_rights(self): """bibupload - FFT has writing rights""" self.failUnless(bibupload.writing_rights_p()) def test_simple_fft_insert(self): """bibupload - simple FFT insert""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.failUnless(try_url_download(testrec_expected_url)) def test_fft_insert_with_valid_embargo(self): """bibupload - FFT insert with valid embargo""" # define the test case: future_date = time.strftime('%Y-%m-%d', time.gmtime(time.time() + 24 
* 3600 * 2)) test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif firerole: deny until '%(future_date)s' allow any """ % { 'future_date': future_date, 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) result = urlopen(testrec_expected_url).read() self.failUnless("This file is restricted." in result, result) def test_fft_insert_with_expired_embargo(self): """bibupload - FFT insert with expired embargo""" # define the test case: past_date = time.strftime('%Y-%m-%d', time.gmtime(time.time() - 24 * 3600 * 2)) test_to_upload = """ SzGeCERN Test, John Test University ARTICLE %(siteurl)s/img/site_logo.gif firerole: deny until '%(past_date)s' allow any """ % { 'past_date': past_date, 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif ARTICLE """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) result = urlopen(testrec_expected_url).read() self.failIf("If you already have an account, please login using the form below." 
in result, result) self.assertEqual(test_web_page_content(testrec_expected_url, 'hyde', 'h123yde', expected_text='Authorization failure'), []) self.force_webcoll(recid) self.assertEqual(test_web_page_content(testrec_expected_url, 'hyde', 'h123yde', expected_text=urlopen("%(siteurl)s/img/site_logo.gif" % { 'siteurl': CFG_SITE_URL }).read()), []) def test_exotic_format_fft_append(self): """bibupload - exotic format FFT append""" # define the test case: testfile = os.path.join(CFG_TMPDIR, 'test.ps.Z') open(testfile, 'w').write('TEST') email_tag = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][0:3] email_ind1 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][3] email_ind2 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][4] email_code = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][5] test_to_upload = """ SzGeCERN Test, John Test University jekyll@cds.cern.ch """ % { 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} testrec_to_append = """ 123456789 %s """ % testfile testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University jekyll@cds.cern.ch 4 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/test.ps.Z """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/test.ps.Z" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url2 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/test?format=ps.Z" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_to_append = testrec_to_append.replace('123456789', str(recid)) testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) testrec_expected_url2 = testrec_expected_url.replace('123456789', str(recid)) recs = bibupload.xml_marc_to_records(testrec_to_append) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='append')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.assertEqual(test_web_page_content(testrec_expected_url, 'jekyll', 'j123ekyll', expected_text='TEST'), []) self.assertEqual(test_web_page_content(testrec_expected_url2, 'jekyll', 'j123ekyll', expected_text='TEST'), []) def test_fft_check_md5_through_bibrecdoc_str(self): """bibupload - simple FFT insert, check md5 through BibRecDocs.str()""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %s/img/head.gif """ % CFG_SITE_URL # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) original_md5 = md5(urlopen('%s/img/head.gif' % CFG_SITE_URL).read()).hexdigest() bibrec_str = str(BibRecDocs(int(recid))) md5_found = False for row in 
bibrec_str.split('\n'): if 'checksum' in row: if original_md5 in row: md5_found = True self.failUnless(md5_found) def test_detailed_fft_insert(self): """bibupload - detailed FFT insert""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif SuperMain This is a description This is a comment CIDIESSE %(siteurl)s/img/rss.png SuperMain .jpeg This is a description This is a second comment CIDIESSE """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/CIDIESSE.gif This is a description This is a comment 530 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/CIDIESSE.jpeg This is a description This is a second comment """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url1 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/CIDIESSE.gif" % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url2 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/CIDIESSE.jpeg" % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url1 = testrec_expected_url1.replace('123456789', str(recid)) testrec_expected_url2 = testrec_expected_url1.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.failUnless(try_url_download(testrec_expected_url1)) self.failUnless(try_url_download(testrec_expected_url2)) def test_simple_fft_insert_with_restriction(self): """bibupload - simple FFT insert with restriction""" # define the test case: email_tag = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][0:3] email_ind1 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][3] email_ind2 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][4] email_code = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][5] test_to_upload = """ SzGeCERN Test, John Test University jekyll@cds.cern.ch ARTICLE %(siteurl)s/img/site_logo.gif thesis %(siteurl)s/img/sb.gif """ % {'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code, 'siteurl': CFG_SITE_URL} testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University jekyll@cds.cern.ch 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif 79 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif?subformat=icon icon ARTICLE """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_icon = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif?subformat=icon" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = 
bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) testrec_expected_icon = testrec_expected_icon.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.assertEqual(test_web_page_content(testrec_expected_icon, 'jekyll', 'j123ekyll', expected_text=urlopen('%(siteurl)s/img/sb.gif' % { 'siteurl': CFG_SITE_URL }).read()), []) self.assertEqual(test_web_page_content(testrec_expected_icon, 'hyde', 'h123yde', expected_text='Authorization failure'), []) self.force_webcoll(recid) self.assertEqual(test_web_page_content(testrec_expected_icon, 'hyde', 'h123yde', expected_text=urlopen('%(siteurl)s/img/restricted.gif' % {'siteurl': CFG_SITE_URL}).read()), []) self.failUnless("HTTP Error 401: Unauthorized" in test_web_page_content(testrec_expected_url, 'hyde', 'h123yde')[0]) self.failUnless("This file is restricted." in urlopen(testrec_expected_url).read()) def test_simple_fft_insert_with_icon(self): """bibupload - simple FFT insert with icon""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif %(siteurl)s/img/sb.gif """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif 79 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif?subformat=icon icon """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_icon = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif?subformat=icon" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) testrec_expected_icon = testrec_expected_icon.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(try_url_download(testrec_expected_icon)) def test_multiple_fft_insert(self): """bibupload - multiple FFT insert""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif %(siteurl)s/img/head.gif %(siteurl)s/%(CFG_SITE_RECORD)s/95/files/9809057.pdf %(prefix)s/var/tmp/demobibdata.xml """ % { 'prefix': CFG_PREFIX, 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 295078 
%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/9809057.pdf %(sizeofdemobibdata)s %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/demobibdata.xml 208 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/head.gif 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'sizeofdemobibdata': os.path.getsize(os.path.join(CFG_TMPDIR, "demobibdata.xml"))} # insert test record: testrec_expected_urls = [] for files in ('site_logo.gif', 'head.gif', '9809057.pdf', 'demobibdata.xml'): testrec_expected_urls.append('%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/%(files)s' % {'siteurl' : CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'files' : files}) recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_urls = [] for files in ('site_logo.gif', 'head.gif', '9809057.pdf', 'demobibdata.xml'): testrec_expected_urls.append('%(siteurl)s/%(CFG_SITE_RECORD)s/%(recid)s/files/%(files)s' % {'siteurl' : CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'files' : files, 'recid' : recid}) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) for url in testrec_expected_urls: self.failUnless(try_url_download(url)) self._test_bibdoc_status(recid, 'head', '') self._test_bibdoc_status(recid, '9809057', '') self._test_bibdoc_status(recid, 'site_logo', '') self._test_bibdoc_status(recid, 'demobibdata', '') def test_simple_fft_correct(self): """bibupload - simple FFT correct""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 %(siteurl)s/img/sb.gif site_logo """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 79 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'site_logo', '') def test_fft_correct_already_exists(self): """bibupload - FFT correct with 
already identical existing file""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif a description %(siteurl)s/img/help.png site_logo another description %(siteurl)s/img/rss.png %(siteurl)s/img/line.gif %(siteurl)s/img/merge.png line """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 %(siteurl)s/img/site_logo.gif a second description %(siteurl)s/img/help.png site_logo another second description %(siteurl)s/img/refresh.png rss %(siteurl)s/img/line.gif %(siteurl)s/img/merge-small.png line """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 35 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/line.gif 626 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/line.png 432 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/rss.png 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif a second description 786 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.png another second description """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url2 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/rss.png" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url3 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.png" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url4 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/line.png" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url5 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/line.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) _, recid, _ = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) testrec_expected_url2 = testrec_expected_url2.replace('123456789', str(recid)) testrec_expected_url3 = testrec_expected_url3.replace('123456789', str(recid)) testrec_expected_url4 = testrec_expected_url4.replace('123456789', str(recid)) testrec_expected_url5 = testrec_expected_url5.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload(recs[0], opt_mode='correct') self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(try_url_download(testrec_expected_url2)) self.failUnless(try_url_download(testrec_expected_url3)) self.failUnless(try_url_download(testrec_expected_url4)) self.failUnless(try_url_download(testrec_expected_url5)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) bibrecdocs = BibRecDocs(recid) self.failUnless(bibrecdocs.get_bibdoc('rss').list_versions(), [1, 2]) self.failUnless(bibrecdocs.get_bibdoc('site_logo').list_versions(), [1]) self.failUnless(bibrecdocs.get_bibdoc('line').list_versions(), 
[1, 2]) def test_fft_correct_modify_doctype(self): """bibupload - FFT correct with different doctype""" test_to_upload = """ SzGeCERN %(siteurl)s/img/site_logo.gif a description TEST1 """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 site_logo TEST2 """ testrec_expected_xm = """ 123456789 SzGeCERN 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif a description """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) _, recid, _ = bibupload.bibupload(recs[0], opt_mode='insert') # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) bibrecdocs = BibRecDocs(recid) self.failUnless(bibrecdocs.get_bibdoc('site_logo').doctype, 'TEST1') # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload(recs[0], opt_mode='correct') # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) bibrecdocs = BibRecDocs(recid) self.failUnless(bibrecdocs.get_bibdoc('site_logo').doctype, 'TEST2') def test_fft_append_already_exists(self): """bibupload - FFT append with already identical existing file""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif a description """ % { 'siteurl': CFG_SITE_URL } test_to_append = """ 123456789 %(siteurl)s/img/site_logo.gif a second description %(siteurl)s/img/help.png site_logo another second description """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif a description 786 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.png another second description """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url2 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.png" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) _, recid, _ = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_append = test_to_append.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_append) err, recid, msg = bibupload.bibupload(recs[0], opt_mode='append') self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(try_url_download(testrec_expected_url2)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) def test_fft_implicit_fix_marc(self): """bibupload - FFT implicit FIX-MARC""" test_to_upload = """ SzGeCERN Test, John Test University foo@bar.com %(siteurl)s/img/site_logo.gif """ % { 'siteurl': 
CFG_SITE_URL } test_to_correct = """ 123456789 foo@bar.com %(siteurl)s/img/site_logo.gif %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University foo@bar.com %(siteurl)s/img/site_logo.gif """ % { 'siteurl': CFG_SITE_URL } recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: test_to_correct = test_to_correct.replace('123456789', str(recid)) testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) # correct test record with implicit FIX-MARC: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) def test_fft_vs_bibedit(self): """bibupload - FFT Vs. BibEdit compatibility""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif """ % { 'siteurl': CFG_SITE_URL } test_to_replace = """ 123456789 SzGeCERN Test, John Test University http://www.google.com/ 2032 BibEdit Comment %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif BibEdit Description 01 http://cern.ch/ """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_xm = str(test_to_replace) testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_replace = test_to_replace.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_replace) bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'site_logo', '') bibrecdocs = BibRecDocs(recid) bibdoc = bibrecdocs.get_bibdoc('site_logo') self.assertEqual(bibdoc.get_description('.gif'), 'BibEdit Description') def test_detailed_fft_correct(self): """bibupload - detailed FFT correct """ # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif Try Comment """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 %(siteurl)s/img/head.gif site_logo patata Next Try KEEP-OLD-VALUE """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 208 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/patata.gif Next Try Comment """ % { 'siteurl': CFG_SITE_URL, 
'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/patata.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct') self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'patata', '') def test_no_url_fft_correct(self): """bibupload - no_url FFT correct""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif Try Comment """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 site_logo patata .gif KEEP-OLD-VALUE Next Comment """ testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/patata.gif Try Next Comment """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/patata.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'patata', '') def test_new_icon_fft_append(self): """bibupload - new icon FFT append""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University """ test_to_correct = """ 123456789 site_logo %(siteurl)s/img/site_logo.gif """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif?subformat=icon icon """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = 
bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='append')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.failUnless(try_url_download(testrec_expected_url)) self._test_bibdoc_status(recid, 'site_logo', '') def test_multiple_fft_correct(self): """bibupload - multiple FFT correct""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif Try Comment Restricted %(siteurl)s/img/okay.gif site_logo .jpeg Try jpeg Comment jpeg Restricted """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 %(siteurl)s/img/loading.gif site_logo patata .gif New restricted """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 9427 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/patata.gif """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/patata.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless("This file is restricted." 
in urlopen(testrec_expected_url).read()) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'patata', 'New restricted') def test_purge_fft_correct(self): """bibupload - purge FFT correct""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif %(siteurl)s/img/head.gif """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 %(siteurl)s/img/site_logo.gif """ % { 'siteurl': CFG_SITE_URL } test_to_purge = """ 123456789 %(siteurl)s/img/site_logo.gif PURGE """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif 208 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/head.gif """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) test_to_purge = test_to_purge.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) # purge test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_purge) bibupload.bibupload_records(recs, opt_mode='correct') self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'site_logo', '') self._test_bibdoc_status(recid, 'head', '') def test_revert_fft_correct(self): """bibupload - revert FFT correct""" # define the test case: email_tag = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][0:3] email_ind1 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][3] email_ind2 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][4] email_code = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][5] test_to_upload = """ SzGeCERN Test, John Test University jekyll@cds.cern.ch %(siteurl)s/img/iconpen.gif site_logo """ % { 'siteurl': CFG_SITE_URL, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} test_to_correct = """ 123456789 %s/img/head.gif site_logo """ % CFG_SITE_URL test_to_revert = """ 123456789 site_logo REVERT 1 """ testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University jekyll@cds.cern.ch 171 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 
'email_code': email_code} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) test_to_revert = test_to_revert.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct') self.check_record_consistency(recid) # revert test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_revert) bibupload.bibupload_records(recs, opt_mode='correct') self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'site_logo', '') expected_content_version1 = urlopen('%s/img/iconpen.gif' % CFG_SITE_URL).read() expected_content_version2 = urlopen('%s/img/head.gif' % CFG_SITE_URL).read() expected_content_version3 = expected_content_version1 self.assertEqual(test_web_page_content('%s/%s/%s/files/site_logo.gif?version=1' % (CFG_SITE_URL, CFG_SITE_RECORD, recid), 'jekyll', 'j123ekyll', expected_content_version1), []) self.assertEqual(test_web_page_content('%s/%s/%s/files/site_logo.gif?version=2' % (CFG_SITE_URL, CFG_SITE_RECORD, recid), 'jekyll', 'j123ekyll', expected_content_version2), []) self.assertEqual(test_web_page_content('%s/%s/%s/files/site_logo.gif?version=3' % (CFG_SITE_URL, CFG_SITE_RECORD, recid), 'jekyll', 'j123ekyll', expected_content_version3), []) def test_simple_fft_replace(self): """bibupload - simple FFT replace""" # define the test case: email_tag = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][0:3] email_ind1 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][3] email_ind2 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][4] email_code = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][5] test_to_upload = """ SzGeCERN Test, John Test University jekyll@cds.cern.ch %(siteurl)s/img/iconpen.gif site_logo """ % {'siteurl': CFG_SITE_URL, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} test_to_replace = """ 123456789 SzGeCERN Test, John Test University jekyll@cds.cern.ch %(siteurl)s/img/head.gif """ % {'siteurl': CFG_SITE_URL, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University jekyll@cds.cern.ch 208 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/head.gif """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} testrec_expected_url = 
"%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/head.gif" % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_replace = test_to_replace.replace('123456789', str(recid)) # replace test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_replace) bibupload.bibupload_records(recs, opt_mode='replace') self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) expected_content_version = urlopen('%s/img/head.gif' % CFG_SITE_URL).read() self.assertEqual(test_web_page_content(testrec_expected_url, 'hyde', 'h123yde', expected_text='Authorization failure'), []) self.assertEqual(test_web_page_content(testrec_expected_url, 'jekyll', 'j123ekyll', expected_text=expected_content_version), []) def test_simple_fft_replace_or_insert(self): """bibupload - simple FFT replace_or_insert""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/iconpen.gif site_logo """ % {'siteurl': CFG_SITE_URL,} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid) ## When insert_or_replace a record for the first time, it should be like ## a simple insert, hence affected_fields should be empty. ## This also for the special case of FFT. 
affected_fields = run_sql("SELECT affected_fields FROM hstRECORD where id_bibrec=%s", (recid,)) self.assertEqual(len(affected_fields), 1) self.failIf(affected_fields[0][0]) def test_simple_fft_insert_with_modification_time(self): """bibupload - simple FFT insert with modification time""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University ARTICLE %(siteurl)s/img/site_logo.gif 2006-05-04 03:02:01 """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif ARTICLE """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url2 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) testrec_expected_url2 = testrec_expected_url2.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.failUnless(try_url_download(testrec_expected_url)) self.force_webcoll(recid) self.tear_down = True self.assertEqual(test_web_page_content(testrec_expected_url2, expected_text='04 May 2006, 03:02'), []) def test_multiple_fft_insert_with_modification_time(self): """bibupload - multiple FFT insert with modification time""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University ARTICLE %(siteurl)s/img/site_logo.gif 2006-05-04 03:02:01 %(siteurl)s/img/head.gif 2007-05-04 03:02:01 %(siteurl)s/%(CFG_SITE_RECORD)s/95/files/9809057.pdf 2008-05-04 03:02:01 %(prefix)s/var/tmp/demobibdata.xml 2009-05-04 03:02:01 """ % { 'prefix': CFG_PREFIX, 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, } testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) self.force_webcoll(recid) self.assertEqual(test_web_page_content(testrec_expected_url, expected_text=['04 May 2006, 03:02', '04 May 2007, 03:02', '04 May 2008, 03:02', '04 May 2009, 03:02']), []) def test_simple_fft_correct_with_modification_time(self): """bibupload - simple FFT correct with modification time""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University ARTICLE %(siteurl)s/img/site_logo.gif 2007-05-04 03:02:01 """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 %(siteurl)s/img/sb.gif site_logo 2008-05-04 03:02:01 """ % { 'siteurl': CFG_SITE_URL } testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/" \ % {'siteurl': 
CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD}
        # insert test record:
        recs = bibupload.xml_marc_to_records(test_to_upload)
        dummy, recid, dummy = bibupload.bibupload(recs[0], opt_mode='insert')
        self.check_record_consistency(recid)
        # replace test buffers with real recid of inserted test record:
        testrec_expected_url = testrec_expected_url.replace('123456789', str(recid))
        test_to_correct = test_to_correct.replace('123456789', str(recid))
        # correct test record with new FFT:
        recs = bibupload.xml_marc_to_records(test_to_correct)
        err, recid, msg = bibupload.bibupload(recs[0], opt_mode='correct')
        self.check_record_consistency(recid)
        self.force_webcoll(recid)
        self.assertEqual(test_web_page_content(testrec_expected_url,
                                               expected_text=['04 May 2008, 03:02']), [])

TEST_SUITE = make_test_suite(BibUploadNoUselessHistoryTest,
                             BibUploadHoldingPenTest,
                             BibUploadInsertModeTest,
                             BibUploadAppendModeTest,
                             BibUploadCorrectModeTest,
                             BibUploadDeleteModeTest,
                             BibUploadReplaceModeTest,
                             BibUploadReferencesModeTest,
                             BibUploadRecordsWithSYSNOTest,
                             BibUploadRecordsWithEXTOAIIDTest,
                             BibUploadRecordsWithOAIIDTest,
                             BibUploadIndicatorsTest,
                             BibUploadUpperLowerCaseTest,
                             BibUploadControlledProvenanceTest,
                             BibUploadStrongTagsTest,
                             BibUploadFFTModeTest,
                             BibUploadPretendTest,
                             BibUploadCallbackURLTest,
                             BibUploadMoreInfoTest,
                             BibUploadBibRelationsTest,
                             BibUploadRecordsWithDOITest,
                             BibUploadTypicalBibEditSessionTest,
                             BibUploadRealCaseRemovalDOIViaBibEdit,
                             )

if __name__ == "__main__":
    run_test_suite(TEST_SUITE, warn_user=True)
diff --git a/modules/miscutil/lib/inveniocfg.py b/modules/miscutil/lib/inveniocfg.py
index 78c1caa2d..3f1bee5f1 100644
--- a/modules/miscutil/lib/inveniocfg.py
+++ b/modules/miscutil/lib/inveniocfg.py
@@ -1,1785 +1,1785 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

"""
Invenio configuration and administration CLI tool.

Usage: inveniocfg [options]

General options:
   -h, --help               print this help
   -V, --version            print version number

Options to finish your installation:
   --create-apache-conf     create Apache configuration files
   --create-tables          create DB tables for Invenio
   --load-bibfield-conf     load the BibField configuration
   --load-webstat-conf      load the WebStat configuration
   --drop-tables            drop DB tables of Invenio
   --check-openoffice       check that the openoffice temporary directory is correctly set up

Options to set up and test a demo site:
   --create-demo-site       create demo site
   --load-demo-records      load demo records
   --remove-demo-records    remove demo records, keeping demo site
   --drop-demo-site         drop demo site configurations too
   --run-unit-tests         run unit test suite (needs demo site)
   --run-regression-tests   run regression test suite (needs demo site)
   --run-web-tests          run web tests in a browser (needs demo site, Firefox, Selenium IDE)

Options to update config files in situ:
   --update-all             perform all the update options
   --update-config-py       update config.py file from invenio.conf file
   --update-dbquery-py      update dbquery.py with DB credentials from invenio.conf
   --update-dbexec          update dbexec with DB credentials from invenio.conf
   --update-bibconvert-tpl  update bibconvert templates with CFG_SITE_URL from invenio.conf
   --update-web-tests       update web test cases with CFG_SITE_URL from invenio.conf

Options to update DB tables:
   --reset-all              perform all the reset options
   --reset-sitename         reset tables to take account of new CFG_SITE_NAME*
   --reset-siteadminemail   reset tables to take account of new CFG_SITE_ADMIN_EMAIL
   --reset-fieldnames       reset tables to take account of new I18N names from PO files
   --reset-recstruct-cache  reset record structure cache according to CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE
   --reset-recjson-cache    reset record json cache according to CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE

Options to upgrade your installation:
   --upgrade                         apply all pending upgrades
   --upgrade-check                   run pre-upgrade checks for all pending upgrades
   --upgrade-show-pending            show pending upgrades ready to be applied
   --upgrade-show-applied            show history of applied upgrades
   --upgrade-create-standard-recipe  create a new upgrade recipe (for developers)
   --upgrade-create-release-recipe   create a new release upgrade recipe (for developers)

Options to help the work:
   --list                   print names and values of all options from conf files
   --get                    get value of a given option from conf files
   --conf-dir               path to directory where invenio*.conf files are [optional]
   --detect-system-details  print system details such as Apache/Python/MySQL versions
"""

__revision__ = "$Id$"

from ConfigParser import ConfigParser
from optparse import OptionParser, OptionGroup, IndentedHelpFormatter, Option, \
    OptionError
import os
import re
import shutil
import socket
import sys
import zlib

def print_usage():
    """Print help."""
    print __doc__

def get_version():
    """ Get running version of Invenio """
    from invenio.config import CFG_VERSION
    return CFG_VERSION

def print_version():
    """Print version information."""
    print get_version()

def convert_conf_option(option_name, option_value):
    """
    Convert conf option into Python config.py line,
    converting values to ints or strings as appropriate.
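
    For illustration only (sample values assumed here, not taken from a
    real invenio.conf), the conversion behaves roughly like this:

        >>> convert_conf_option('CFG_SITE_LANG', 'en')
        'CFG_SITE_LANG = "en"'
        >>> convert_conf_option('CFG_SITE_LANGS', 'en,fr')
        "CFG_SITE_LANGS = ['en', 'fr', ]"
        >>> convert_conf_option('CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS', '1')
        'CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS = 1'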
""" ## 1) convert option name to uppercase: option_name = option_name.upper() ## 1a) adjust renamed variables: if option_name in ['CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_DOCTYPES', 'CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_RESTRICTIONS', 'CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_MISC', 'CFG_WEBSUBMIT_FILESYSTEM_BIBDOC_GROUP_LIMIT', 'CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS', 'CFG_WEBSUBMIT_DESIRED_CONVERSIONS']: new_option_name = option_name.replace('WEBSUBMIT', 'BIBDOCFILE') print >> sys.stderr, ("""WARNING: %s has been renamed to %s. Please, update your invenio-local.conf file accordingly.""" % (option_name, new_option_name)) option_name = new_option_name ## 2) convert option value to int or string: if option_name in ['CFG_BIBUPLOAD_REFERENCE_TAG', 'CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG', 'CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG', 'CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG', 'CFG_BIBUPLOAD_STRONG_TAGS', 'CFG_BIBFORMAT_HIDDEN_TAGS']: # some options are supposed be string even when they look like # numeric option_value = '"' + option_value + '"' else: try: option_value = int(option_value) except ValueError: option_value = '"' + option_value + '"' ## 3a) special cases: chars regexps if option_name in ['CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS', 'CFG_BIBINDEX_CHARS_PUNCTUATION']: option_value = 'r"[' + option_value[1:-1] + ']"' ## 3abis) special cases: real regexps if option_name in ['CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES', 'CFG_BATCHUPLOADER_WEB_ROBOT_AGENTS', 'CFG_BIBUPLOAD_INTERNAL_DOI_PATTERN']: option_value = 'r"' + option_value[1:-1] + '"' ## 3b) special cases: True, False, None if option_value in ['"True"', '"False"', '"None"']: option_value = option_value[1:-1] ## 3c) special cases: dicts and real pythonic lists if option_name in ['CFG_WEBSEARCH_FIELDS_CONVERT', 'CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS', 'CFG_WEBSEARCH_FULLTEXT_SNIPPETS', 'CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS', 'CFG_SITE_EMERGENCY_EMAIL_ADDRESSES', 'CFG_BIBMATCH_FUZZY_WORDLIMITS', 'CFG_BIBMATCH_QUERY_TEMPLATES', 'CFG_WEBSEARCH_SYNONYM_KBRS', 'CFG_BIBINDEX_SYNONYM_KBRS', 'CFG_WEBCOMMENT_EMAIL_REPLIES_TO', 'CFG_WEBCOMMENT_RESTRICTION_DATAFIELD', 'CFG_WEBCOMMENT_ROUND_DATAFIELD', 'CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS', 'CFG_BIBSCHED_NODE_TASKS', 'CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE', 'CFG_OAI_METADATA_FORMATS', 'CFG_BIBDOCFILE_DESIRED_CONVERSIONS', 'CFG_BIBDOCFILE_BEST_FORMATS_TO_EXTRACT_TEXT_FROM', 'CFG_WEB_API_KEY_ALLOWED_URL', 'CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_MISC', 'CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_DOCTYPES', 'CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_RESTRICTIONS', 'CFG_REFEXTRACT_KBS_OVERRIDE', 'CFG_OPENID_CONFIGURATIONS', 'CFG_OAUTH1_CONFIGURATIONS', 'CFG_OAUTH2_CONFIGURATIONS', 'CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES', 'CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING', 'CFG_BIBSCHED_NON_CONCURRENT_TASKS', 'CFG_REDIS_HOSTS', 'CFG_BIBSCHED_INCOMPATIBLE_TASKS', 'CFG_ICON_CREATION_FORMAT_MAPPINGS', 'CFG_BIBEDIT_AUTOCOMPLETE']: try: option_value = option_value[1:-1] if option_name == "CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE" and option_value.strip().startswith("{"): print >> sys.stderr, ("""ERROR: CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE now accepts only a list of tuples, not a dictionary. Check invenio.conf for an example. 
Please, update your invenio-local.conf file accordingly.""") sys.exit(1) except TypeError: if option_name in ('CFG_WEBSEARCH_FULLTEXT_SNIPPETS',): print >> sys.stderr, """WARNING: CFG_WEBSEARCH_FULLTEXT_SNIPPETS has changed syntax: it can be customised to display different snippets for different document types. See the corresponding documentation in invenio.conf. You may want to customise your invenio-local.conf configuration accordingly.""" option_value = """{'': %s}""" % option_value else: print >> sys.stderr, "ERROR: type error in %s value %s." % \ (option_name, option_value) sys.exit(1) ## 3cbis) very special cases: dicts with backward compatible string if option_name in ['CFG_BIBINDEX_SPLASH_PAGES']: if option_value.startswith('"{') and option_value.endswith('}"'): option_value = option_value[1:-1] else: option_value = """{%s: ".*"}""" % option_value ## 3d) special cases: comma-separated lists if option_name in ['CFG_SITE_LANGS', 'CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS', 'CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS', 'CFG_BIBUPLOAD_STRONG_TAGS', 'CFG_BIBFORMAT_HIDDEN_TAGS', 'CFG_BIBFORMAT_HIDDEN_RECJSON_FIELDS', 'CFG_BIBSCHED_GC_TASKS_TO_REMOVE', 'CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE', 'CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS', 'CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS', 'CFG_BIBUPLOAD_DELETE_FORMATS', 'CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST', 'CFG_WEBSEARCH_RSS_I18N_COLLECTIONS', 'CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY', 'CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY', 'CFG_BIBCIRCULATION_ITEM_STATUS_OPTIONAL', 'CFG_PLOTEXTRACTOR_DISALLOWED_TEX', 'CFG_OAI_FRIENDS', 'CFG_WEBSTYLE_REVERSE_PROXY_IPS', 'CFG_BIBEDIT_AUTOCOMPLETE_INSTITUTIONS_FIELDS', 'CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS', 'CFG_BIBFORMAT_HIDDEN_FILE_FORMATS', 'CFG_BIBFIELD_MASTER_FORMATS', 'CFG_OPENID_PROVIDERS', 'CFG_OAUTH1_PROVIDERS', 'CFG_OAUTH2_PROVIDERS', 'CFG_BIBFORMAT_CACHED_FORMATS', 'CFG_BIBEDIT_ADD_TICKET_RT_QUEUES', 'CFG_BIBAUTHORID_ENABLED_REMOTE_LOGIN_SYSTEMS',]: out = "[" for elem in option_value[1:-1].split(","): if elem: elem = elem.strip() # string values out += "'%s', " % elem out += "]" option_value = out ## 3e) special cases: multiline if option_name == 'CFG_OAI_IDENTIFY_DESCRIPTION': # make triple quotes option_value = '""' + option_value + '""' ## 3f) ignore some options: if option_name.startswith('CFG_SITE_NAME_INTL'): # treated elsewhere return ## 3g) special cases: float if option_name in ['CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY', 'CFG_BIBMATCH_LOCAL_SLEEPTIME', 'CFG_BIBMATCH_REMOTE_SLEEPTIME', 'CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT', 'CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT']: option_value = float(option_value[1:-1]) ## 3h) special cases: bibmatch validation list if option_name in ['CFG_BIBMATCH_MATCH_VALIDATION_RULESETS']: option_value = option_value[1:-1] ## 4a) dropped variables if option_name in ['CFG_BATCHUPLOADER_WEB_ROBOT_AGENT']: print >> sys.stderr, ("""ERROR: CFG_BATCHUPLOADER_WEB_ROBOT_AGENT has been dropped in favour of CFG_BATCHUPLOADER_WEB_ROBOT_AGENTS. Please, update your invenio-local.conf file accordingly.""") sys.exit(1) option_value = option_value[1:-1] elif option_name in ['CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_DOCTYPES', 'CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_RESTRICTIONS', 'CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_MISC', 'CFG_WEBSUBMIT_FILESYSTEM_BIBDOC_GROUP_LIMIT', 'CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS', 'CFG_WEBSUBMIT_DESIRED_CONVERSIONS']: new_option_name = option_name.replace('WEBSUBMIT', 'BIBDOCFILE') print >> sys.stderr, ("""ERROR: %s has been renamed to %s. 
Please, update your invenio-local.conf file accordingly.""" % (option_name, new_option_name)) option_name = new_option_name ## 5) finally, return output line: return '%s = %s' % (option_name, option_value) def cli_cmd_update_config_py(conf): """ Update new config.py from conf options, keeping previous config.py in a backup copy. """ ## NOTE: the following function exists also in urlutils.py ## However we can't import urlutils here, as it depends on config.py ## to already exist, while we are in the process of creating it. def get_relative_url(url): """ Returns the relative URL from a URL. For example: 'http://web.net' -> '' 'http://web.net/' -> '' 'http://web.net/1222' -> '/1222' 'http://web.net/wsadas/asd' -> '/wsadas/asd' It will never return a trailing "/". @param url: A url to transform @type url: str @return: relative URL """ # remove any protocol info before stripped_site_url = url.replace("://", "") baseurl = "/" + "/".join(stripped_site_url.split("/")[1:]) # remove any trailing slash ("/") if baseurl[-1] == "/": return baseurl[:-1] else: return baseurl print ">>> Going to update config.py..." ## location where config.py is: configpyfile = conf.get("Invenio", "CFG_PYLIBDIR") + \ os.sep + 'invenio' + os.sep + 'config.py' ## backup current config.py file: if os.path.exists(configpyfile): shutil.copy(configpyfile, configpyfile + '.OLD') ## here we go: fdesc = open(configpyfile, 'w') ## generate preamble: fdesc.write("# -*- coding: utf-8 -*-\n") fdesc.write("# DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED\n") fdesc.write("# FROM INVENIO.CONF BY EXECUTING:\n") fdesc.write("# " + " ".join(sys.argv) + "\n") ## special treatment for CFG_SITE_NAME_INTL options: fdesc.write("CFG_SITE_NAME_INTL = {}\n") for lang in conf.get("Invenio", "CFG_SITE_LANGS").split(","): fdesc.write("CFG_SITE_NAME_INTL['%s'] = \"%s\"\n" % (lang, conf.get("Invenio", "CFG_SITE_NAME_INTL_" + lang))) ## special treatment for CFG_SITE_SECURE_URL that may be empty, in ## which case it should be put equal to CFG_SITE_URL: if not conf.get("Invenio", "CFG_SITE_SECURE_URL"): conf.set("Invenio", "CFG_SITE_SECURE_URL", conf.get("Invenio", "CFG_SITE_URL")) ## Special treatment of base URL, adding CFG_BASE_URL base_url = get_relative_url(conf.get("Invenio", "CFG_SITE_URL")) fdesc.write("CFG_BASE_URL = \"%s\"\n" % (base_url,)) ## process all the options normally: sections = conf.sections() sections.sort() for section in sections: options = conf.options(section) options.sort() for option in options: if not option.upper().startswith('CFG_DATABASE_'): # put all options except for db credentials into config.py line_out = convert_conf_option(option, conf.get(section, option)) if line_out: fdesc.write(line_out + "\n") ## generate postamble: fdesc.write("") fdesc.write("# END OF GENERATED FILE") ## we are done: fdesc.close() print "You may want to restart Apache now." print ">>> config.py updated successfully." def cli_cmd_update_dbquery_py(conf): """ Update lib/dbquery.py file with DB parameters read from conf file. Note: this edits dbquery.py in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update dbquery.py..." 
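    ## NOTE: the block below regenerates dbquery_config.py from the conf
    ## files, keeping only the CFG_DATABASE_* credentials
    ## (cli_cmd_update_config_py above deliberately skips CFG_DATABASE_*
    ## when writing config.py, so the credentials end up only here).
    ## For illustration only, with hypothetical values, the generated file
    ## looks like:
    ##
    ##   # -*- coding: utf-8 -*-
    ##   CFG_DATABASE_HOST = 'localhost'
    ##   CFG_DATABASE_PORT = '3306'
    ##   CFG_DATABASE_NAME = 'invenio'
    ##   CFG_DATABASE_USER = 'invenio'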
## location where dbquery.py is: dbqueryconfigpyfile = conf.get("Invenio", "CFG_PYLIBDIR") + \ os.sep + 'invenio' + os.sep + 'dbquery_config.py' ## backup current dbquery.py file: if os.path.exists(dbqueryconfigpyfile + 'c'): shutil.copy(dbqueryconfigpyfile + 'c', dbqueryconfigpyfile + 'c.OLD') out = ["%s = '%s'\n" % (item.upper(), value) \ for item, value in conf.items('Invenio') \ if item.upper().startswith('CFG_DATABASE_')] fdesc = open(dbqueryconfigpyfile, 'w') fdesc.write("# -*- coding: utf-8 -*-\n") fdesc.writelines(out) fdesc.close() print "You may want to restart Apache now." print ">>> dbquery.py updated successfully." def cli_cmd_update_dbexec(conf): """ Update bin/dbexec file with DB parameters read from conf file. Note: this edits dbexec in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update dbexec..." ## location where dbexec is: dbexecfile = conf.get("Invenio", "CFG_BINDIR") + \ os.sep + 'dbexec' ## backup current dbexec file: if os.path.exists(dbexecfile): shutil.copy(dbexecfile, dbexecfile + '.OLD') ## replace db parameters via sed: out = '' for line in open(dbexecfile, 'r').readlines(): match = re.search(r'^CFG_DATABASE_(HOST|PORT|NAME|USER|PASS|SLAVE)(\s*=\s*)\'.*\'$', line) if match: dbparam = 'CFG_DATABASE_' + match.group(1) out += "%s%s'%s'\n" % (dbparam, match.group(2), conf.get("Invenio", dbparam)) else: out += line fdesc = open(dbexecfile, 'w') fdesc.write(out) fdesc.close() print ">>> dbexec updated successfully." def cli_cmd_update_bibconvert_tpl(conf): """ Update bibconvert/config/*.tpl files looking for 856 http://.../CFG_SITE_RECORD lines, replacing URL with CFG_SITE_URL taken from conf file. Note: this edits tpl files in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update bibconvert templates..." ## location where bibconvert/config/*.tpl are: tpldir = conf.get("Invenio", 'CFG_ETCDIR') + \ os.sep + 'bibconvert' + os.sep + 'config' ## find all *.tpl files: for tplfilename in os.listdir(tpldir): if tplfilename.endswith(".tpl"): ## change tpl file: tplfile = tpldir + os.sep + tplfilename shutil.copy(tplfile, tplfile + '.OLD') out = '' for line in open(tplfile, 'r').readlines(): match = re.search(r'^(.*)http://.*?/%s/(.*)$' % conf.get("Invenio", 'CFG_SITE_RECORD'), line) if match: out += "%s%s/%s/%s\n" % (match.group(1), conf.get("Invenio", 'CFG_SITE_URL'), conf.get("Invenio", 'CFG_SITE_RECORD'), match.group(2)) else: out += line fdesc = open(tplfile, 'w') fdesc.write(out) fdesc.close() print ">>> bibconvert templates updated successfully." def cli_cmd_update_web_tests(conf): """ Update web test cases lib/webtest/test_*.html looking for http://.+?[>> Going to update web tests..." ## location where test_*.html files are: testdir = conf.get("Invenio", 'CFG_PREFIX') + os.sep + \ 'lib' + os.sep + 'webtest' + os.sep + 'invenio' ## find all test_*.html files: for testfilename in os.listdir(testdir): if testfilename.startswith("test_") and \ testfilename.endswith(".html"): ## change test file: testfile = testdir + os.sep + testfilename shutil.copy(testfile, testfile + '.OLD') out = '' for line in open(testfile, 'r').readlines(): match = re.search(r'^(.*)http://.+?([)/opt/invenio(.*)$', line) if match: out += "%s%s%s\n" % (match.group(1), conf.get("Invenio", 'CFG_PREFIX'), match.group(2)) else: out += line fdesc = open(testfile, 'w') fdesc.write(out) fdesc.close() print ">>> web tests updated successfully." 
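The three update commands above (dbexec, bibconvert templates, web tests) all follow the same in-situ editing pattern: copy the target file to a .OLD backup, read it line by line, rewrite the lines that match a regular expression, and write everything back. A minimal standalone sketch of that pattern follows; the helper name, file path and pattern are hypothetical and not part of inveniocfg:

import re
import shutil

def rewrite_in_place(path, pattern, replacement):
    """Back up PATH to PATH.OLD, then apply PATTERN -> REPLACEMENT to every line."""
    shutil.copy(path, path + '.OLD')          # keep a backup, as the commands above do
    out = ''
    for line in open(path, 'r').readlines():
        out += re.sub(pattern, replacement, line)
    fdesc = open(path, 'w')
    fdesc.write(out)
    fdesc.close()

# hypothetical usage, pointing a dbexec-style credential line at a new host:
# rewrite_in_place('/opt/invenio/bin/dbexec',
#                  r"^(CFG_DATABASE_HOST\s*=\s*)'.*'$",
#                  r"\1'db.example.org'")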
def cli_cmd_reset_sitename(conf): """ Reset collection-related tables with new CFG_SITE_NAME and CFG_SITE_NAME_INTL* read from conf files. """ print ">>> Going to reset CFG_SITE_NAME and CFG_SITE_NAME_INTL..." from invenio.dbquery import run_sql, IntegrityError # reset CFG_SITE_NAME: sitename = conf.get("Invenio", "CFG_SITE_NAME") try: run_sql("""INSERT INTO collection (id, name, dbquery, reclist) VALUES (1,%s,NULL,NULL)""", (sitename,)) except IntegrityError: run_sql("""UPDATE collection SET name=%s WHERE id=1""", (sitename,)) # reset CFG_SITE_NAME_INTL: for lang in conf.get("Invenio", "CFG_SITE_LANGS").split(","): sitename_lang = conf.get("Invenio", "CFG_SITE_NAME_INTL_" + lang) try: run_sql("""INSERT INTO collectionname (id_collection, ln, type, value) VALUES (%s,%s,%s,%s)""", (1, lang, 'ln', sitename_lang)) except IntegrityError: run_sql("""UPDATE collectionname SET value=%s WHERE ln=%s AND id_collection=1 AND type='ln'""", (sitename_lang, lang)) print "You may want to restart Apache now." print ">>> CFG_SITE_NAME and CFG_SITE_NAME_INTL* reset successfully." def cli_cmd_reset_recstruct_cache(conf): """If CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE is changed, this function will adapt the database to either store or not store the recstruct format.""" from invenio.intbitset import intbitset from invenio.dbquery import run_sql, serialize_via_marshal from invenio.search_engine import get_record, print_record from invenio.bibsched import server_pid, pidfile enable_recstruct_cache = conf.get("Invenio", "CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") enable_recstruct_cache = enable_recstruct_cache in ('True', '1') pid = server_pid(ping_the_process=False) if pid: print >> sys.stderr, "ERROR: bibsched seems to run with pid %d, according to %s." % (pid, pidfile) print >> sys.stderr, " Please stop bibsched before running this procedure." sys.exit(1) if enable_recstruct_cache: print ">>> Searching records which need recstruct cache resetting; this may take a while..." all_recids = intbitset(run_sql("SELECT id FROM bibrec")) good_recids = intbitset(run_sql("SELECT bibrec.id FROM bibrec JOIN bibfmt ON bibrec.id = bibfmt.id_bibrec WHERE format='recstruct' AND modification_date < last_updated")) recids = all_recids - good_recids print ">>> Generating recstruct cache..." tot = len(recids) count = 0 for recid in recids: try: value = serialize_via_marshal(get_record(recid)) except zlib.error, err: print >> sys.stderr, "Looks like XM is corrupted for record %s. Let's recover it from bibxxx" % recid run_sql("DELETE FROM bibfmt WHERE id_bibrec=%s AND format='xm'", (recid, )) xm_value = zlib.compress(print_record(recid, 'xm')) run_sql("INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, 'xm', NOW(), %s)", (recid, xm_value)) value = serialize_via_marshal(get_record(recid)) run_sql("DELETE FROM bibfmt WHERE id_bibrec=%s AND format='recstruct'", (recid, )) - run_sql("INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, 'recstruct', NOW(), %s)", (recid, value)) + run_sql("INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, 'recstruct', NOW(), _binary %s)", (recid, value)) count += 1 if count % 1000 == 0: print " ... done records %s/%s" % (count, tot) if count % 1000 != 0: print " ... done records %s/%s" % (count, tot) print ">>> recstruct cache generated successfully." else: print ">>> Cleaning recstruct cache..." 
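    ## NOTE: the "_binary" introducer added to the INSERT above tells MySQL
    ## to treat the %s placeholder as binary-charset data, so the marshalled
    ## record blob is stored verbatim instead of being interpreted under the
    ## connection character set.  The same change is applied to
    ## firerole_def_ser in access_control_firerole.py later in this diff.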
run_sql("DELETE FROM bibfmt WHERE format='recstruct'") def cli_cmd_reset_recjson_cache(conf): """If CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE is changed, this function will adapt the database to either store or not store the recjson format.""" try: import cPickle as pickle except: import pickle from invenio.intbitset import intbitset from invenio.dbquery import run_sql from invenio.bibfield import get_record from invenio.bibsched import server_pid, pidfile enable_recjson_cache = conf.get("Invenio", "CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") enable_recjson_cache = enable_recjson_cache in ('True', '1') pid = server_pid(ping_the_process=False) if pid: print >> sys.stderr, "ERROR: bibsched seems to run with pid %d, according to %s." % (pid, pidfile) print >> sys.stderr, " Please stop bibsched before running this procedure." sys.exit(1) if enable_recjson_cache: print ">>> Searching records which need recjson cache resetting; this may take a while..." all_recids = intbitset(run_sql("SELECT id FROM bibrec")) #TODO: prevent doing all records? recids = all_recids print ">>> Generating recjson cache..." tot = len(recids) count = 0 cli_cmd_load_bibfield_config(conf) for recid in recids: run_sql("DELETE FROM bibfmt WHERE id_bibrec=%s AND format='recjson'", (recid,)) #TODO: Update the cache or wait for the first access get_record(recid) count += 1 if count % 1000 == 0: print " ... done records %s/%s" % (count, tot) if count % 1000 != 0: print " ... done records %s/%s" % (count, tot) print ">>> recjson cache generated successfully." def cli_cmd_reset_siteadminemail(conf): """ Reset user-related tables with new CFG_SITE_ADMIN_EMAIL read from conf files. """ print ">>> Going to reset CFG_SITE_ADMIN_EMAIL..." from invenio.dbquery import run_sql siteadminemail = conf.get("Invenio", "CFG_SITE_ADMIN_EMAIL") run_sql("DELETE FROM user WHERE id=1") run_sql("""INSERT INTO user (id, email, password, note, nickname) VALUES (1, %s, AES_ENCRYPT(email, ''), 1, 'admin')""", (siteadminemail,)) print "You may want to restart Apache now." print ">>> CFG_SITE_ADMIN_EMAIL reset successfully." def cli_cmd_reset_fieldnames(conf): """ Reset I18N field names such as author, title, etc and other I18N ranking method names such as word similarity. Their translations are taken from the PO files. """ print ">>> Going to reset I18N field names..." 
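    ## NOTE: like cli_cmd_reset_sitename above, this command uses an
    ## "INSERT first, UPDATE on IntegrityError" upsert so that it works both
    ## on a freshly created database and on one where the rows already exist.
    ## Illustrative shape only (the real statements follow below):
    ##
    ##   try:
    ##       run_sql("INSERT INTO fieldname (...) VALUES (...)", params)
    ##   except IntegrityError:
    ##       run_sql("UPDATE fieldname SET value=%s WHERE ...", params)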
from invenio.messages import gettext_set_language, language_list_long from invenio.dbquery import run_sql, IntegrityError ## get field id and name list: field_id_name_list = run_sql("SELECT id, name FROM field") ## get rankmethod id and name list: rankmethod_id_name_list = run_sql("SELECT id, name FROM rnkMETHOD") ## update names for every language: for lang, dummy in language_list_long(): _ = gettext_set_language(lang) ## this list is put here in order for PO system to pick names ## suitable for translation field_name_names = {"any field": _("any field"), "title": _("title"), "author": _("author"), "abstract": _("abstract"), "keyword": _("keyword"), "report number": _("report number"), "subject": _("subject"), "reference": _("reference"), "fulltext": _("fulltext"), "collection": _("collection"), "division": _("division"), "year": _("year"), "journal": _("journal"), "experiment": _("experiment"), "record ID": _("record ID")} ## update I18N names for every language: for (field_id, field_name) in field_id_name_list: if field_name_names.has_key(field_name): try: run_sql("""INSERT INTO fieldname (id_field,ln,type,value) VALUES (%s,%s,%s,%s)""", (field_id, lang, 'ln', field_name_names[field_name])) except IntegrityError: run_sql("""UPDATE fieldname SET value=%s WHERE id_field=%s AND ln=%s AND type=%s""", (field_name_names[field_name], field_id, lang, 'ln',)) ## ditto for rank methods: rankmethod_name_names = {"wrd": _("word similarity"), "demo_jif": _("journal impact factor"), "citation": _("times cited"), "citerank_citation_t": _("time-decay cite count"), "citerank_pagerank_c": _("all-time-best cite rank"), "citerank_pagerank_t": _("time-decay cite rank"),} for (rankmethod_id, rankmethod_name) in rankmethod_id_name_list: if rankmethod_name_names.has_key(rankmethod_name): try: run_sql("""INSERT INTO rnkMETHODNAME (id_rnkMETHOD,ln,type,value) VALUES (%s,%s,%s,%s)""", (rankmethod_id, lang, 'ln', rankmethod_name_names[rankmethod_name])) except IntegrityError: run_sql("""UPDATE rnkMETHODNAME SET value=%s WHERE id_rnkMETHOD=%s AND ln=%s AND type=%s""", (rankmethod_name_names[rankmethod_name], rankmethod_id, lang, 'ln',)) print ">>> I18N field names reset successfully." def cli_check_openoffice(conf): """ If OpenOffice.org integration is enabled, checks whether the system is properly configured. """ from invenio.bibtask import check_running_process_user from invenio.websubmit_file_converter import can_unoconv, get_file_converter_logger logger = get_file_converter_logger() for handler in logger.handlers: logger.removeHandler(handler) check_running_process_user() print ">>> Checking if Libre/OpenOffice.org is correctly integrated...", sys.stdout.flush() if can_unoconv(True): print "ok" else: sys.exit(1) def test_db_connection(): """ Test DB connection, and if fails, advise user how to set it up. Useful to be called during table creation. """ print "Testing DB connection...", from invenio.textutils import wrap_text_in_a_box from invenio.dbquery import run_sql, Error ## first, test connection to the DB server: try: run_sql("SHOW TABLES") except Error, err: from invenio.dbquery import CFG_DATABASE_HOST, CFG_DATABASE_PORT, \ CFG_DATABASE_NAME, CFG_DATABASE_USER, CFG_DATABASE_PASS print wrap_text_in_a_box("""\ DATABASE CONNECTIVITY ERROR %(errno)d: %(errmsg)s.\n Perhaps you need to set up database and connection rights? 
If yes, then please login as MySQL admin user and run the following commands now: $ mysql -h %(dbhost)s -P %(dbport)s -u root -p mysql mysql> CREATE DATABASE %(dbname)s DEFAULT CHARACTER SET utf8; mysql> GRANT ALL PRIVILEGES ON %(dbname)s.* TO %(dbuser)s@%(webhost)s IDENTIFIED BY '%(dbpass)s'; mysql> QUIT The values printed above were detected from your configuration. If they are not right, then please edit your invenio-local.conf file and rerun 'inveniocfg --update-all' first. If the problem is of different nature, then please inspect the above error message and fix the problem before continuing.""" % \ {'errno': err.args[0], 'errmsg': err.args[1], 'dbname': CFG_DATABASE_NAME, 'dbhost': CFG_DATABASE_HOST, 'dbport': CFG_DATABASE_PORT, 'dbuser': CFG_DATABASE_USER, 'dbpass': CFG_DATABASE_PASS, 'webhost': CFG_DATABASE_HOST == 'localhost' and 'localhost' or os.popen('hostname -f', 'r').read().strip(), }) sys.exit(1) print "ok" ## second, test insert/select of a Unicode string to detect ## possible Python/MySQL/MySQLdb mis-setup: print "Testing Python/MySQL/MySQLdb UTF-8 chain...", try: try: beta_in_utf8 = "β" # Greek beta in UTF-8 is 0xCEB2 run_sql("CREATE TABLE test__invenio__utf8 (x char(1), y varbinary(2)) DEFAULT CHARACTER SET utf8 ENGINE=MyISAM;") run_sql("INSERT INTO test__invenio__utf8 (x, y) VALUES (%s, %s)", (beta_in_utf8, beta_in_utf8)) res = run_sql("SELECT x,y,HEX(x),HEX(y),LENGTH(x),LENGTH(y),CHAR_LENGTH(x),CHAR_LENGTH(y) FROM test__invenio__utf8") assert res[0] == ('\xce\xb2', '\xce\xb2', 'CEB2', 'CEB2', 2L, 2L, 1L, 2L) run_sql("DROP TABLE test__invenio__utf8") except Exception, err: print wrap_text_in_a_box("""\ DATABASE RELATED ERROR %s\n A problem was detected with the UTF-8 treatment in the chain between the Python application, the MySQLdb connector, and the MySQL database. You may perhaps have installed older versions of some prerequisite packages?\n Please check the INSTALL file and please fix this problem before continuing.""" % err) sys.exit(1) finally: run_sql("DROP TABLE IF EXISTS test__invenio__utf8") print "ok" def cli_cmd_create_tables(conf): """Create and fill Invenio DB tables. Useful for the installation process.""" print ">>> Going to create and fill tables..." from invenio.config import CFG_PREFIX test_db_connection() for cmd in ["%s/bin/dbexec < %s/lib/sql/invenio/tabcreate.sql" % (CFG_PREFIX, CFG_PREFIX), "%s/bin/dbexec < %s/lib/sql/invenio/tabfill.sql" % (CFG_PREFIX, CFG_PREFIX)]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) cli_cmd_reset_sitename(conf) cli_cmd_reset_siteadminemail(conf) cli_cmd_reset_fieldnames(conf) for cmd in ["%s/bin/webaccessadmin -u admin -c -a" % CFG_PREFIX]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Tables created and filled successfully." def cli_cmd_load_webstat_conf(conf): print ">>> Going to load WebStat config..." from invenio.config import CFG_PREFIX cmd = "%s/bin/webstatadmin --load-config" % CFG_PREFIX if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> WebStat config load successfully." def cli_cmd_load_bibfield_config(conf): print ">>> Going to load BibField config..." from invenio.bibfield_config_engine import BibFieldParser BibFieldParser.reparse() print ">>> BibField config load successfully." def cli_cmd_drop_tables(conf): """Drop Invenio DB tables. Useful for the uninstallation process.""" print ">>> Going to drop tables..." 
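    ## NOTE: the UTF-8 round-trip in test_db_connection() above inserts a
    ## Greek beta (U+03B2, UTF-8 bytes 0xCE 0xB2) into a CHAR(1) utf8 column
    ## and a VARBINARY(2) column, then compares HEX(), LENGTH() and
    ## CHAR_LENGTH().  The expected row ('\xce\xb2', '\xce\xb2', 'CEB2',
    ## 'CEB2', 2, 2, 1, 2) means: the same two bytes are stored in both
    ## columns, counted as one character in the utf8 column and as two in
    ## the binary one, which is what a correctly configured
    ## Python/MySQLdb/MySQL chain should return.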
from invenio.config import CFG_PREFIX from invenio.textutils import wrap_text_in_a_box, wait_for_user from invenio.webstat import destroy_customevents wait_for_user(wrap_text_in_a_box("""WARNING: You are going to destroy your database tables!""")) msg = destroy_customevents() if msg: print msg cmd = "%s/bin/dbexec < %s/lib/sql/invenio/tabdrop.sql" % (CFG_PREFIX, CFG_PREFIX) if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Tables dropped successfully." def cli_cmd_create_demo_site(conf): """Create demo site. Useful for testing purposes.""" print ">>> Going to create demo site..." from invenio.config import CFG_PREFIX from invenio.dbquery import run_sql run_sql("TRUNCATE schTASK") run_sql("TRUNCATE session") run_sql("DELETE FROM user WHERE email=''") for cmd in ["%s/bin/dbexec < %s/lib/sql/invenio/democfgdata.sql" % \ (CFG_PREFIX, CFG_PREFIX),]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) cli_cmd_reset_fieldnames(conf) # needed for I18N demo ranking method names for cmd in ["%s/bin/webaccessadmin -u admin -c -r -D" % CFG_PREFIX, "%s/bin/webcoll -u admin" % CFG_PREFIX, "%s/bin/webcoll 1" % CFG_PREFIX, "%s/bin/bibsort -u admin --load-config" % CFG_PREFIX, "%s/bin/bibsort 2" % CFG_PREFIX, ]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Demo site created successfully." def cli_cmd_load_demo_records(conf): """Load demo records. Useful for testing purposes.""" from invenio.config import CFG_PREFIX from invenio.dbquery import run_sql print ">>> Going to load demo records..." run_sql("TRUNCATE schTASK") for cmd in ["%s/bin/bibupload -u admin -i %s/var/tmp/demobibdata.xml" % (CFG_PREFIX, CFG_PREFIX), "%s/bin/bibupload 1" % CFG_PREFIX, "%s/bin/bibdocfile --textify --with-ocr --recid 97" % CFG_PREFIX, "%s/bin/bibdocfile --textify --all" % CFG_PREFIX, "%s/bin/bibindex -u admin" % CFG_PREFIX, "%s/bin/bibindex 2" % CFG_PREFIX, "%s/bin/bibindex -u admin -w global" % CFG_PREFIX, "%s/bin/bibindex 3" % CFG_PREFIX, "%s/bin/bibreformat -u admin -o HB" % CFG_PREFIX, "%s/bin/bibreformat 4" % CFG_PREFIX, "%s/bin/webcoll -u admin" % CFG_PREFIX, "%s/bin/webcoll 5" % CFG_PREFIX, "%s/bin/bibrank -u admin" % CFG_PREFIX, "%s/bin/bibrank 6" % CFG_PREFIX, "%s/bin/bibsort -u admin -R" % CFG_PREFIX, "%s/bin/bibsort 7" % CFG_PREFIX, "%s/bin/oairepositoryupdater -u admin" % CFG_PREFIX, "%s/bin/oairepositoryupdater 8" % CFG_PREFIX, "%s/bin/bibupload 9" % CFG_PREFIX,]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Demo records loaded successfully." def cli_cmd_remove_demo_records(conf): """Remove demo records. Useful when you are finished testing.""" print ">>> Going to remove demo records..." from invenio.config import CFG_PREFIX from invenio.dbquery import run_sql from invenio.textutils import wrap_text_in_a_box, wait_for_user wait_for_user(wrap_text_in_a_box("""WARNING: You are going to destroy your records and documents!""")) if os.path.exists(CFG_PREFIX + os.sep + 'var' + os.sep + 'data'): shutil.rmtree(CFG_PREFIX + os.sep + 'var' + os.sep + 'data') run_sql("TRUNCATE schTASK") for cmd in ["%s/bin/dbexec < %s/lib/sql/invenio/tabbibclean.sql" % (CFG_PREFIX, CFG_PREFIX), "%s/bin/webcoll -u admin" % CFG_PREFIX, "%s/bin/webcoll 1" % CFG_PREFIX,]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Demo records removed successfully." def cli_cmd_drop_demo_site(conf): """Drop demo site completely. 
Useful when you are finished testing.""" print ">>> Going to drop demo site..." from invenio.textutils import wrap_text_in_a_box, wait_for_user wait_for_user(wrap_text_in_a_box("""WARNING: You are going to destroy your site and documents!""")) cli_cmd_drop_tables(conf) cli_cmd_create_tables(conf) cli_cmd_remove_demo_records(conf) print ">>> Demo site dropped successfully." def cli_cmd_run_unit_tests(conf): """Run unit tests, usually on the working demo site.""" from invenio.testutils import build_and_run_unit_test_suite if not build_and_run_unit_test_suite(): sys.exit(1) def cli_cmd_run_js_unit_tests(conf): """Run JavaScript unit tests, usually on the working demo site.""" from invenio.testutils import build_and_run_js_unit_test_suite if not build_and_run_js_unit_test_suite(): sys.exit(1) def cli_cmd_run_regression_tests(conf): """Run regression tests, usually on the working demo site.""" from invenio.testutils import build_and_run_regression_test_suite if not build_and_run_regression_test_suite(): sys.exit(1) def cli_cmd_run_web_tests(conf): """Run web tests in a browser. Requires Firefox with Selenium.""" from invenio.testutils import build_and_run_web_test_suite if not build_and_run_web_test_suite(): sys.exit(1) def _detect_ip_address(conf): """Detect IP address of this computer. Useful for creating Apache vhost conf snippet on RHEL like machines. However, if wanted site is 0.0.0.0, then use that, since we are running inside Docker. @return: IP address, or '*' if cannot detect @rtype: string @note: creates socket for real in order to detect real IP address, not the loopback one. """ if '0.0.0.0' in conf.get('Invenio', 'CFG_SITE_URL'): return '0.0.0.0' try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect(('invenio-software.org', 0)) return s.getsockname()[0] except: return '*' def cli_cmd_create_apache_conf(conf): """ Create Apache conf files for this site, keeping previous files in a backup copy. """ print ">>> Going to create Apache conf files..." 
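    ## NOTE: when a concrete vhost IP is needed (RHEL/SLC systems other than
    ## CentOS 6/7), the templates below use _detect_ip_address() above, which
    ## "connects" a UDP socket (no packet is sent for a SOCK_DGRAM connect)
    ## and reads the chosen local address back with getsockname(), falling
    ## back to '*'.  A standalone sketch of the same trick, with a
    ## hypothetical probe host:
    ##
    ##   import socket
    ##   s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    ##   s.connect(('example.org', 53))     # needs name resolution only, no traffic
    ##   print s.getsockname()[0]           # outward-facing address, not the loopback one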
from invenio.textutils import wrap_text_in_a_box from invenio.access_control_config import CFG_EXTERNAL_AUTH_USING_SSO apache_conf_dir = conf.get("Invenio", 'CFG_ETCDIR') + \ os.sep + 'apache' if guess_apache_24(): directory_www_directive = """ # Uncomment the following on Apache < 2.4 # # Options FollowSymLinks MultiViews # AllowOverride None # Order allow,deny # Allow from all # # Comment the following on Apache < 2.4 Options FollowSymLinks MultiViews AllowOverride None Require all granted """ % {'webdir': conf.get('Invenio', 'CFG_WEBDIR')} directory_wsgi_directive = """ # Uncomment the following on Apache < 2.4 # # WSGIProcessGroup invenio # WSGIApplicationGroup %%{GLOBAL} # Options FollowSymLinks MultiViews # AllowOverride None # Order allow,deny # Allow from all # # Comment the following on Apache < 2.4 WSGIProcessGroup invenio WSGIApplicationGroup %%{GLOBAL} Options FollowSymLinks MultiViews AllowOverride None Require all granted """ % {'wsgidir': os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'www-wsgi')} else: directory_www_directive = """ # Comment the following on Apache >= 2.4 Options FollowSymLinks MultiViews AllowOverride None Order allow,deny Allow from all # Uncomment the following on Apache >= 2.4 # # Options FollowSymLinks MultiViews # AllowOverride None # Require all granted # """ % {'webdir': conf.get('Invenio', 'CFG_WEBDIR')} directory_wsgi_directive = """ # Comment the following on Apache >= 2.4 WSGIProcessGroup invenio WSGIApplicationGroup %%{GLOBAL} Options FollowSymLinks MultiViews AllowOverride None Order allow,deny Allow from all # Uncomment the following on Apache >= 2.4 # # WSGIProcessGroup invenio # WSGIApplicationGroup %%{GLOBAL} # Options FollowSymLinks MultiViews # AllowOverride None # Require all granted # """ % {'wsgidir': os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'www-wsgi')} ## Preparation of XSendFile directive xsendfile_directive_needed = int(conf.get("Invenio", 'CFG_BIBDOCFILE_USE_XSENDFILE')) != 0 if xsendfile_directive_needed: xsendfile_directive = "XSendFile On\n" else: xsendfile_directive = "#XSendFile On\n" for path in (conf.get('Invenio', 'CFG_BIBDOCFILE_FILEDIR'), # BibDocFile conf.get('Invenio', 'CFG_WEBDIR'), conf.get('Invenio', 'CFG_WEBSUBMIT_STORAGEDIR'), # WebSubmit conf.get('Invenio', 'CFG_TMPDIR'), os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'tmp', 'attachfile'), os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'data', 'comments'), os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'data', 'baskets', 'comments'), os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'lib', 'webdoc', 'invenio', 'info'), '/tmp'): # BibExport if xsendfile_directive_needed: xsendfile_directive += ' XSendFilePath %s\n' % path else: xsendfile_directive += ' #XSendFilePath %s\n' % path xsendfile_directive = xsendfile_directive.strip() ## Preparation of deflate directive deflate_directive_needed = int(conf.get("Invenio", 'CFG_WEBSTYLE_HTTP_USE_COMPRESSION')) != 0 if deflate_directive_needed: deflate_directive = r""" ## Configuration snippet taken from: ## SetOutputFilter DEFLATE # Netscape 4.x has some problems... BrowserMatch ^Mozilla/4 gzip-only-text/html # Netscape 4.06-4.08 have some more problems BrowserMatch ^Mozilla/4\.0[678] no-gzip # MSIE masquerades as Netscape, but it is fine # BrowserMatch \bMSIE !no-gzip !gzip-only-text/html # NOTE: Due to a bug in mod_setenvif up to Apache 2.0.48 # the above regex won't work. 
You can use the following # workaround to get the desired effect: BrowserMatch \bMSI[E] !no-gzip !gzip-only-text/html # Don't compress images SetEnvIfNoCase Request_URI \ \.(?:gif|jpe?g|png)$ no-gzip dont-vary # Make sure proxies don't deliver the wrong content Header append Vary User-Agent env=!dont-vary """ else: deflate_directive = "" if CFG_EXTERNAL_AUTH_USING_SSO: shibboleth_directive = r""" SSLRequireSSL # The modules only work using HTTPS AuthType shibboleth ShibRequireSession On ShibRequireAll On ShibExportAssertion Off require valid-user """ else: shibboleth_directive = "" ## Apache vhost conf file is distro specific, so analyze needs: # Gentoo (and generic defaults): listen_directive_needed = True ssl_pem_directive_needed = False ssl_pem_path = '/etc/apache2/ssl/apache.pem' ssl_crt_path = '/etc/apache2/ssl/server.crt' ssl_key_path = '/etc/apache2/ssl/server.key' vhost_ip_address_needed = False wsgi_socket_directive_needed = False # Debian: if os.path.exists(os.path.sep + 'etc' + os.path.sep + 'debian_version'): listen_directive_needed = False ssl_pem_directive_needed = True # RHEL/SLC: if os.path.exists(os.path.sep + 'etc' + os.path.sep + 'redhat-release'): listen_directive_needed = False ssl_crt_path = '/etc/pki/tls/certs/localhost.crt' ssl_key_path = '/etc/pki/tls/private/localhost.key' vhost_ip_address_needed = True if os.popen('grep -c "CentOS.*[67]\." /etc/redhat-release').read().strip() == '1': vhost_ip_address_needed = False wsgi_socket_directive_needed = True # maybe we are using non-standard ports? vhost_site_url = conf.get('Invenio', 'CFG_SITE_URL').replace("http://", "") if vhost_site_url.startswith("https://"): ## The installation is configured to require HTTPS for any connection vhost_site_url = vhost_site_url.replace("https://", "") vhost_site_url_port = '80' vhost_site_secure_url = conf.get('Invenio', 'CFG_SITE_SECURE_URL').replace("https://", "") vhost_site_secure_url_port = '443' if ':' in vhost_site_url: vhost_site_url, vhost_site_url_port = vhost_site_url.split(':', 1) if ':' in vhost_site_secure_url: vhost_site_secure_url, vhost_site_secure_url_port = vhost_site_secure_url.split(':', 1) if vhost_site_url_port != '80' or vhost_site_secure_url_port != '443': listen_directive_needed = True ## OK, let's create Apache vhost files: if not os.path.exists(apache_conf_dir): os.mkdir(apache_conf_dir) apache_vhost_file = apache_conf_dir + os.sep + \ 'invenio-apache-vhost.conf' apache_vhost_ssl_file = apache_conf_dir + os.sep + \ 'invenio-apache-vhost-ssl.conf' apache_vhost_body = """\ AddDefaultCharset UTF-8 ServerSignature Off ServerTokens Prod NameVirtualHost %(vhost_ip_address)s:%(vhost_site_url_port)s %(listen_directive)s %(wsgi_socket_directive)s WSGIPythonHome %(wsgi_python_home)s WSGIRestrictStdout Off deny from all deny from all ServerName %(servername)s ServerAlias %(serveralias)s ServerAdmin %(serveradmin)s DocumentRoot %(webdir)s %(directory_www_directive)s ErrorLog %(logdir)s/apache.err LogLevel warn LogFormat "%%h %%l %%u %%t \\"%%r\\" %%>s %%b \\"%%{Referer}i\\" \\"%%{User-agent}i\\" %%D" combined_with_timing CustomLog %(logdir)s/apache.log combined_with_timing DirectoryIndex index.en.html index.html Alias /static/ %(webdir)s/static/ Alias /img/ %(webdir)s/img/ Alias /css/ %(webdir)s/css/ Alias /js/ %(webdir)s/js/ Alias /flash/ %(webdir)s/flash/ Alias /export/ %(webdir)s/export/ Alias /MathJax/ %(webdir)s/MathJax/ Alias /jsCalendar/ %(webdir)s/jsCalendar/ Alias /ckeditor/ %(webdir)s/ckeditor/ Alias /mediaelement/ %(webdir)s/mediaelement/ AliasMatch 
/sitemap-(.*) %(webdir)s/sitemap-$1 Alias /robots.txt %(webdir)s/robots.txt Alias /favicon.ico %(webdir)s/favicon.ico WSGIDaemonProcess invenio processes=5 threads=1 display-name=%%{GROUP} inactivity-timeout=3600 maximum-requests=10000 %(wsgiuser)s WSGIImportScript %(wsgidir)s/invenio.wsgi process-group=invenio application-group=%%{GLOBAL} WSGIScriptAlias / %(wsgidir)s/invenio.wsgi WSGIPassAuthorization On %(xsendfile_directive)s %(directory_wsgi_directive)s %(deflate_directive)s """ % {'vhost_site_url_port': vhost_site_url_port, 'servername': vhost_site_url, 'serveralias': vhost_site_url.split('.')[0], 'serveradmin': conf.get('Invenio', 'CFG_SITE_ADMIN_EMAIL'), 'webdir': conf.get('Invenio', 'CFG_WEBDIR'), 'logdir': conf.get('Invenio', 'CFG_LOGDIR'), 'wsgidir': os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'www-wsgi'), 'wsgiuser': conf.get('Invenio', 'CFG_BIBSCHED_PROCESS_USER') and 'user='+conf.get('Invenio', 'CFG_BIBSCHED_PROCESS_USER'), 'vhost_ip_address': vhost_ip_address_needed and _detect_ip_address(conf) or '*', 'listen_directive': listen_directive_needed and 'Listen ' + vhost_site_url_port or \ '#Listen ' + vhost_site_url_port, 'wsgi_python_home': sys.prefix, 'wsgi_socket_directive': (wsgi_socket_directive_needed and \ 'WSGISocketPrefix ' or '#WSGISocketPrefix ') + \ conf.get('Invenio', 'CFG_PREFIX') + os.sep + 'var' + os.sep + 'run', 'xsendfile_directive': xsendfile_directive, 'directory_www_directive': directory_www_directive, 'directory_wsgi_directive': directory_wsgi_directive, 'deflate_directive': deflate_directive, } apache_vhost_ssl_body = """\ ServerSignature Off ServerTokens Prod %(listen_directive)s NameVirtualHost %(vhost_ip_address)s:%(vhost_site_secure_url_port)s %(ssl_pem_directive)s %(ssl_crt_directive)s %(ssl_key_directive)s %(ssl_protocol_directive)s %(ssl_cipher_directive)s WSGIRestrictStdout Off deny from all deny from all ServerName %(servername)s ServerAlias %(serveralias)s ServerAdmin %(serveradmin)s SSLEngine on DocumentRoot %(webdir)s %(directory_www_directive)s ErrorLog %(logdir)s/apache-ssl.err LogLevel warn LogFormat "%%h %%l %%u %%t \\"%%r\\" %%>s %%b \\"%%{Referer}i\\" \\"%%{User-agent}i\\" %%D" combined_with_timing CustomLog %(logdir)s/apache-ssl.log combined_with_timing DirectoryIndex index.en.html index.html Alias /static/ %(webdir)s/static/ Alias /img/ %(webdir)s/img/ Alias /css/ %(webdir)s/css/ Alias /js/ %(webdir)s/js/ Alias /flash/ %(webdir)s/flash/ Alias /export/ %(webdir)s/export/ Alias /MathJax/ %(webdir)s/MathJax/ Alias /jsCalendar/ %(webdir)s/jsCalendar/ Alias /ckeditor/ %(webdir)s/ckeditor/ Alias /mediaelement/ %(webdir)s/mediaelement/ AliasMatch /sitemap-(.*) %(webdir)s/sitemap-$1 Alias /robots.txt %(webdir)s/robots.txt Alias /favicon.ico %(webdir)s/favicon.ico RedirectMatch /sslredirect/(.*) http://$1 WSGIScriptAlias / %(wsgidir)s/invenio.wsgi WSGIPassAuthorization On %(xsendfile_directive)s %(directory_wsgi_directive)s %(deflate_directive)s %(shibboleth_directive)s """ % {'vhost_site_secure_url_port': vhost_site_secure_url_port, 'servername': vhost_site_secure_url, 'serveralias': vhost_site_secure_url.split('.')[0], 'serveradmin': conf.get('Invenio', 'CFG_SITE_ADMIN_EMAIL'), 'webdir': conf.get('Invenio', 'CFG_WEBDIR'), 'logdir': conf.get('Invenio', 'CFG_LOGDIR'), 'wsgidir' : os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'www-wsgi'), 'vhost_ip_address': vhost_ip_address_needed and _detect_ip_address(conf) or '*', 'listen_directive' : listen_directive_needed and 'Listen ' + vhost_site_secure_url_port or \ '#Listen ' + 
vhost_site_secure_url_port, 'ssl_pem_directive': ssl_pem_directive_needed and \ 'SSLCertificateFile %s' % ssl_pem_path or \ '#SSLCertificateFile %s' % ssl_pem_path, 'ssl_crt_directive': ssl_pem_directive_needed and \ '#SSLCertificateFile %s' % ssl_crt_path or \ 'SSLCertificateFile %s' % ssl_crt_path, 'ssl_key_directive': ssl_pem_directive_needed and \ '#SSLCertificateKeyFile %s' % ssl_key_path or \ 'SSLCertificateKeyFile %s' % ssl_key_path, 'ssl_protocol_directive': ssl_pem_directive_needed and \ 'SSLProtocol all -SSLv2 -SSLv3' or \ '#SSLProtocol all -SSLv2 -SSLv3', 'ssl_cipher_directive': ssl_pem_directive_needed and \ 'SSLCipherSuite HIGH:MEDIUM:!ADH' or \ '#SSLCipherSuite HIGH:MEDIUM:!ADH', 'xsendfile_directive': xsendfile_directive, 'directory_www_directive': directory_www_directive, 'directory_wsgi_directive': directory_wsgi_directive, 'deflate_directive': deflate_directive, 'shibboleth_directive': shibboleth_directive, } # write HTTP vhost snippet: if os.path.exists(apache_vhost_file): shutil.copy(apache_vhost_file, apache_vhost_file + '.OLD') fdesc = open(apache_vhost_file, 'w') fdesc.write(apache_vhost_body) fdesc.close() print print "Created file", apache_vhost_file # write HTTPS vhost snippet: vhost_ssl_created = False if conf.get('Invenio', 'CFG_SITE_SECURE_URL').startswith("https://"): if os.path.exists(apache_vhost_ssl_file): shutil.copy(apache_vhost_ssl_file, apache_vhost_ssl_file + '.OLD') fdesc = open(apache_vhost_ssl_file, 'w') fdesc.write(apache_vhost_ssl_body) fdesc.close() vhost_ssl_created = True print "Created file", apache_vhost_ssl_file print wrap_text_in_a_box("""\ Apache virtual host configuration file(s) for your Invenio site was(were) created. Please check created file(s) and activate virtual host(s). For example, you can put the following include statements in your httpd.conf:\n Include %s %s Please see the INSTALL file for more details. """ % (apache_vhost_file, (vhost_ssl_created and 'Include ' or '#Include ') + apache_vhost_ssl_file)) print ">>> Apache conf files created." def cli_cmd_get(conf, varname): """ Return value of VARNAME read from CONF files. Useful for third-party programs to access values of conf options such as CFG_PREFIX. Return None if VARNAME is not found. """ try: if not varname: raise Exception("ERROR: Please specify a configuration variable.") varname = varname.lower() # do not pay attention to section names yet: all_options = {} for section in conf.sections(): for option in conf.options(section): all_options[option] = conf.get(section, option) varvalue = all_options.get(varname, None) if varvalue is None: raise Exception() print varvalue except Exception, e: if e.message: print e.message sys.exit(1) def cli_cmd_list(conf): """ Print a list of all conf options and values from CONF. """ sections = conf.sections() sections.sort() for section in sections: options = conf.options(section) options.sort() for option in options: print option.upper(), '=', conf.get(section, option) def _grep_version_from_executable(path_to_exec, version_regexp): """ Try to detect a program version by digging into its binary PATH_TO_EXEC and looking for VERSION_REGEXP. Return program version as a string. Return empty string if not succeeded. 
""" from invenio.shellutils import run_shell_command exec_version = "" if os.path.exists(path_to_exec): dummy1, cmd2_out, dummy2 = run_shell_command("strings %s | grep %s", (path_to_exec, version_regexp)) if cmd2_out: for cmd2_out_line in cmd2_out.split("\n"): if len(cmd2_out_line) > len(exec_version): # the longest the better exec_version = cmd2_out_line return exec_version _RE_APACHE_MAJOR_VERSION = re.compile(r"Apache/(\d+\.\d+)") def guess_apache_24(apache_versions=None): """ Returns True if it looks like the system is running Apache 2.4 or later. """ if apache_versions is None: apache_versions = detect_apache_version() for apache_version in apache_versions: g = _RE_APACHE_MAJOR_VERSION.search(apache_version) if g: try: version = float(g.group(1)) except ValueError: continue if version >= 2.4: return True return False def detect_apache_version(): """ Try to detect Apache version by localizing httpd or apache executables and grepping inside binaries. Return list of all found Apache versions and paths. (For a given executable, the returned format is 'apache_version [apache_path]'.) Return empty list if no success. """ from invenio.shellutils import run_shell_command out = [] dummy1, cmd_out, dummy2 = run_shell_command("locate bin/httpd bin/apache") for apache in cmd_out.split("\n"): apache_version = _grep_version_from_executable(apache, '^Apache\/') if apache_version: out.append("%s [%s]" % (apache_version, apache)) return out def cli_cmd_detect_system_details(conf): """ Detect and print system details such as Apache/Python/MySQL versions etc. Useful for debugging problems on various OS. """ import MySQLdb print ">>> Going to detect system details..." print "* Hostname: " + socket.gethostname() print "* Invenio version: " + conf.get("Invenio", "CFG_VERSION") print "* Python version: " + sys.version.replace("\n", " ") print "* Apache version: " + ";\n ".join(detect_apache_version()) print "* MySQLdb version: " + MySQLdb.__version__ try: from invenio.dbquery import run_sql print "* MySQL version:" for key, val in run_sql("SHOW VARIABLES LIKE 'version%'") + \ run_sql("SHOW VARIABLES LIKE 'charact%'") + \ run_sql("SHOW VARIABLES LIKE 'collat%'"): if False: print " - %s: %s" % (key, val) elif key in ['version', 'character_set_client', 'character_set_connection', 'character_set_database', 'character_set_results', 'character_set_server', 'character_set_system', 'collation_connection', 'collation_database', 'collation_server']: print " - %s: %s" % (key, val) except ImportError: print "* ERROR: cannot import dbquery" print ">>> System details detected successfully." def cli_cmd_upgrade(conf): """ Command for applying upgrades """ from invenio.inveniocfg_upgrader import cmd_upgrade cmd_upgrade(conf) def cli_cmd_upgrade_check(conf): """ Command for running pre-upgrade checks """ from invenio.inveniocfg_upgrader import cmd_upgrade_check cmd_upgrade_check(conf) def cli_cmd_upgrade_show_pending(conf): """ Command for showing upgrades ready to be applied """ from invenio.inveniocfg_upgrader import cmd_upgrade_show_pending cmd_upgrade_show_pending(conf) def cli_cmd_upgrade_show_applied(conf): """ Command for showing all upgrades already applied. """ from invenio.inveniocfg_upgrader import cmd_upgrade_show_applied cmd_upgrade_show_applied(conf) def cli_cmd_upgrade_create_release_recipe(conf, path): """ Create a new release upgrade recipe (for developers). 
""" from invenio.inveniocfg_upgrader import cmd_upgrade_create_release_recipe cmd_upgrade_create_release_recipe(conf, path) def cli_cmd_upgrade_create_standard_recipe(conf, path, depends_on=None, release=False): """ Create a new upgrade recipe (for developers). """ from invenio.inveniocfg_upgrader import cmd_upgrade_create_standard_recipe cmd_upgrade_create_standard_recipe(conf, path, depends_on=depends_on, release=release) def prepare_option_parser(): """Parse the command line options.""" class InvenioOption(Option): """ Option class that implements the action 'store_append_const' which will 1) append to list in options. 2) take a value and store in options. Useful for e.g. appending a const to an actions list, while also taking an option value and storing it. This ensures that we can run actions in the order they are given on the command-line. Python 2.4 compatibility note: *append_const* action is not available in Python 2.4, so it is implemented here, together with the new action *store_append_const*. """ ACTIONS = Option.ACTIONS + ("store_append_const", "append_const") STORE_ACTIONS = Option.STORE_ACTIONS + ("store_append_const", "append_const") TYPED_ACTIONS = Option.TYPED_ACTIONS + ("store_append_const", ) ALWAYS_TYPED_ACTIONS = Option.ALWAYS_TYPED_ACTIONS + ("store_append_const", ) CONST_ACTIONS = getattr(Option, 'CONST_ACTIONS', ()) + ("store_append_const", "append_const") def take_action(self, action, dest, opt, value, values, parser): if action == "store_append_const": # Combination of 'store' and 'append_const' actions values.ensure_value(dest, []).append(self.const) value_dest = self.const.replace('-', '_') setattr(values, value_dest, value) elif action == "append_const" and not hasattr(Option, 'CONST_ACTIONS'): values.ensure_value(dest, []).append(self.const) else: Option.take_action(self, action, dest, opt, value, values, parser) def _check_const(self): if self.action not in self.CONST_ACTIONS and self.const is not None: raise OptionError( "'const' must not be supplied for action %r" % self.action, self) CHECK_METHODS = [ Option._check_action, Option._check_type, Option._check_choice, Option._check_dest, _check_const, Option._check_nargs, Option._check_callback, ] parser = OptionParser(option_class=InvenioOption, description="Invenio configuration and administration CLI tool", formatter=IndentedHelpFormatter(max_help_position=31)) parser.add_option("-V", "--version", action="store_true", help="print version number") finish_options = OptionGroup(parser, "Options to finish your installation") finish_options.add_option("", "--create-apache-conf", dest='actions', const='create-apache-conf', action="append_const", help="create Apache configuration files") finish_options.add_option("", "--create-tables", dest='actions', const='create-tables', action="append_const", help="create DB tables for Invenio") finish_options.add_option("", "--load-bibfield-conf", dest='actions', const='load-bibfield-conf', action="append_const", help="load bibfield configuration file") finish_options.add_option("", "--load-webstat-conf", dest='actions', const='load-webstat-conf', action="append_const", help="load the WebStat configuration") finish_options.add_option("", "--drop-tables", dest='actions', const='drop-tables', action="append_const", help="drop DB tables of Invenio") finish_options.add_option("", "--check-openoffice", dest='actions', const='check-openoffice', action="append_const", help="check for correctly set up of openoffice temporary directory") parser.add_option_group(finish_options) 
demotest_options = OptionGroup(parser, "Options to set up and test a demo site") demotest_options.add_option("", "--create-demo-site", dest='actions', const='create-demo-site', action="append_const", help="create demo site") demotest_options.add_option("", "--load-demo-records", dest='actions', const='load-demo-records', action="append_const", help="load demo records") demotest_options.add_option("", "--remove-demo-records", dest='actions', const='remove-demo-records', action="append_const", help="remove demo records, keeping demo site") demotest_options.add_option("", "--drop-demo-site", dest='actions', const='drop-demo-site', action="append_const", help="drop demo site configurations too") demotest_options.add_option("", "--run-unit-tests", dest='actions', const='run-unit-tests', action="append_const", help="run unit test suite (needs demo site)") demotest_options.add_option("", "--run-js-unit-tests", dest='actions', const='run-js-unit-tests', action="append_const", help="run JS unit test suite (needs demo site)") demotest_options.add_option("", "--run-regression-tests", dest='actions', const='run-regression-tests', action="append_const", help="run regression test suite (needs demo site)") demotest_options.add_option("", "--run-web-tests", dest='actions', const='run-web-tests', action="append_const", help="run web tests in a browser (needs demo site, Firefox, Selenium IDE)") parser.add_option_group(demotest_options) config_options = OptionGroup(parser, "Options to update config files in situ") config_options.add_option("", "--update-all", dest='actions', const='update-all', action="append_const", help="perform all the update options") config_options.add_option("", "--update-config-py", dest='actions', const='update-config-py', action="append_const", help="update config.py file from invenio.conf file") config_options.add_option("", "--update-dbquery-py", dest='actions', const='update-dbquery-py', action="append_const", help="update dbquery.py with DB credentials from invenio.conf") config_options.add_option("", "--update-dbexec", dest='actions', const='update-dbexec', action="append_const", help="update dbexec with DB credentials from invenio.conf") config_options.add_option("", "--update-bibconvert-tpl", dest='actions', const='update-bibconvert-tpl', action="append_const", help="update bibconvert templates with CFG_SITE_URL from invenio.conf") config_options.add_option("", "--update-web-tests", dest='actions', const='update-web-tests', action="append_const", help="update web test cases with CFG_SITE_URL from invenio.conf") parser.add_option_group(config_options) reset_options = OptionGroup(parser, "Options to update DB tables") reset_options.add_option("", "--reset-all", dest='actions', const='reset-all', action="append_const", help="perform all the reset options") reset_options.add_option("", "--reset-sitename", dest='actions', const='reset-sitename', action="append_const", help="reset tables to take account of new CFG_SITE_NAME*") reset_options.add_option("", "--reset-siteadminemail", dest='actions', const='reset-siteadminemail', action="append_const", help="reset tables to take account of new CFG_SITE_ADMIN_EMAIL") reset_options.add_option("", "--reset-fieldnames", dest='actions', const='reset-fieldnames', action="append_const", help="reset tables to take account of new I18N names from PO files") reset_options.add_option("", "--reset-recstruct-cache", dest='actions', const='reset-recstruct-cache', action="append_const", help="reset record structure cache according to 
CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") reset_options.add_option("", "--reset-recjson-cache", dest='actions', const='reset-recjson-cache', action="append_const", help="reset record json structure cache according to CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") parser.add_option_group(reset_options) upgrade_options = OptionGroup(parser, "Options to upgrade your installation") upgrade_options.add_option("", "--upgrade", dest='actions', const='upgrade', action="append_const", help="apply all pending upgrades") upgrade_options.add_option("", "--upgrade-check", dest='actions', const='upgrade-check', action="append_const", help="run pre-upgrade checks for pending upgrades") upgrade_options.add_option("", "--upgrade-show-pending", dest='actions', const='upgrade-show-pending', action="append_const", help="show pending upgrades") upgrade_options.add_option("", "--upgrade-show-applied", dest='actions', const='upgrade-show-applied', action="append_const", help="show history of applied upgrades") upgrade_options.add_option("", "--upgrade-create-standard-recipe", dest='actions', metavar='REPOSITORY[,DIR]', const='upgrade-create-standard-recipe', action="store_append_const", help="create a new standard upgrade recipe (for developers)") upgrade_options.add_option("", "--upgrade-create-release-recipe", dest='actions', metavar='REPOSITORY[,DIR]', const='upgrade-create-release-recipe', action="store_append_const", help="create a new release upgrade recipe (for developers)") parser.add_option_group(upgrade_options) helper_options = OptionGroup(parser, "Options to help the work") helper_options.add_option("", "--list", dest='actions', const='list', action="append_const", help="print names and values of all options from conf files") helper_options.add_option("", "--get", dest='actions', const='get', action="store_append_const", metavar="OPTION", help="get value of a given option from conf files") helper_options.add_option("", "--conf-dir", action="store", metavar="PATH", help="path to directory where invenio*.conf files are [optional]") helper_options.add_option("", "--detect-system-details", dest='actions', const='detect-system-details', action="append_const", help="print system details such as Apache/Python/MySQL versions") parser.add_option_group(helper_options) parser.add_option('--yes-i-know', action='store_true', dest='yes-i-know', help='use with care!') parser.add_option('-x', '--stop', action='store_true', dest='stop_on_error', help='When running tests, stop at first error') return parser def prepare_conf(options): """ Read configuration files """ conf = ConfigParser() confdir = getattr(options, 'conf_dir', None) if confdir is None: ## try to detect path to conf dir (relative to this bin dir): confdir = re.sub(r'/bin$', '/etc', sys.path[0]) if confdir and not os.path.exists(confdir): raise Exception("ERROR: bad --conf-dir option value - directory does not exists.") sys.exit(1) ## read conf files: for conffile in [confdir + os.sep + 'invenio.conf', confdir + os.sep + 'invenio-autotools.conf', confdir + os.sep + 'invenio-local.conf', ]: if os.path.exists(conffile): conf.read(conffile) else: if not conffile.endswith("invenio-local.conf"): # invenio-local.conf is optional, otherwise stop raise Exception("ERROR: Badly guessed conf file location %s (Please use --conf-dir option.)" % conffile) return conf def main(*cmd_args): """Main entry point.""" # Allow easier testing if not cmd_args: cmd_args = sys.argv[1:] # Parse arguments parser = prepare_option_parser() (options, dummy_args) = 
parser.parse_args(list(cmd_args)) if getattr(options, 'stop_on_error', False): from invenio.testutils import wrap_failfast wrap_failfast() if getattr(options, 'version', False): print_version() else: # Read configuration try: conf = prepare_conf(options) except Exception, e: print e sys.exit(1) ## Decide what to do actions = getattr(options, 'actions', None) if not actions: print """ERROR: Please specify a command. Please see '--help'.""" sys.exit(1) for action in actions: if action == 'get': cli_cmd_get(conf, getattr(options, 'get', None)) elif action == 'list': cli_cmd_list(conf) elif action == 'detect-system-details': cli_cmd_detect_system_details(conf) elif action == 'create-tables': cli_cmd_create_tables(conf) elif action == 'load-webstat-conf': cli_cmd_load_webstat_conf(conf) elif action == 'drop-tables': cli_cmd_drop_tables(conf) elif action == 'check-openoffice': cli_check_openoffice(conf) elif action == 'load-bibfield-conf': cli_cmd_load_bibfield_config(conf) elif action == 'create-demo-site': cli_cmd_create_demo_site(conf) elif action == 'load-demo-records': cli_cmd_load_demo_records(conf) elif action == 'remove-demo-records': cli_cmd_remove_demo_records(conf) elif action == 'drop-demo-site': cli_cmd_drop_demo_site(conf) elif action == 'run-unit-tests': cli_cmd_run_unit_tests(conf) elif action == 'run-js-unit-tests': cli_cmd_run_js_unit_tests(conf) elif action == 'run-regression-tests': cli_cmd_run_regression_tests(conf) elif action == 'run-web-tests': cli_cmd_run_web_tests(conf) elif action == 'update-all': cli_cmd_update_config_py(conf) cli_cmd_update_dbquery_py(conf) cli_cmd_update_dbexec(conf) cli_cmd_update_bibconvert_tpl(conf) cli_cmd_update_web_tests(conf) elif action == 'update-config-py': cli_cmd_update_config_py(conf) elif action == 'update-dbquery-py': cli_cmd_update_dbquery_py(conf) elif action == 'update-dbexec': cli_cmd_update_dbexec(conf) elif action == 'update-bibconvert-tpl': cli_cmd_update_bibconvert_tpl(conf) elif action == 'update-web-tests': cli_cmd_update_web_tests(conf) elif action == 'reset-all': cli_cmd_reset_sitename(conf) cli_cmd_reset_siteadminemail(conf) cli_cmd_reset_fieldnames(conf) cli_cmd_reset_recstruct_cache(conf) elif action == 'reset-sitename': cli_cmd_reset_sitename(conf) elif action == 'reset-siteadminemail': cli_cmd_reset_siteadminemail(conf) elif action == 'reset-fieldnames': cli_cmd_reset_fieldnames(conf) elif action == 'reset-recstruct-cache': cli_cmd_reset_recstruct_cache(conf) elif action == 'reset-recjson-cache': cli_cmd_reset_recjson_cache(conf) elif action == 'create-apache-conf': cli_cmd_create_apache_conf(conf) elif action == 'upgrade': cli_cmd_upgrade(conf) elif action == 'upgrade-check': cli_cmd_upgrade_check(conf) elif action == 'upgrade-show-pending': cli_cmd_upgrade_show_pending(conf) elif action == 'upgrade-show-applied': cli_cmd_upgrade_show_applied(conf) elif action == 'upgrade-create-standard-recipe': cli_cmd_upgrade_create_standard_recipe(conf, getattr(options, 'upgrade_create_standard_recipe', None)) elif action == 'upgrade-create-release-recipe': cli_cmd_upgrade_create_release_recipe(conf, getattr(options, 'upgrade_create_release_recipe', None)) else: print "ERROR: Unknown command", action sys.exit(1) if __name__ == '__main__': main() diff --git a/modules/webaccess/lib/access_control_firerole.py b/modules/webaccess/lib/access_control_firerole.py index 01e1fb756..efd51a505 100644 --- a/modules/webaccess/lib/access_control_firerole.py +++ b/modules/webaccess/lib/access_control_firerole.py @@ -1,337 +1,344 @@ # This file 
is part of Invenio. -# Copyright (C) 2007, 2008, 2009, 2010, 2011, 2013 CERN. +# Copyright (C) 2007, 2008, 2009, 2010, 2011, 2013, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Invenio Access Control FireRole.""" __revision__ = "$Id$" __lastupdated__ = """$Date$""" """These functions are for realizing a firewall like role definition for extending webaccess to connect user to roles using every infos about users. """ import re import cPickle from zlib import compress, decompress import sys import time if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 from invenio.access_control_config import InvenioWebAccessFireroleError from invenio.dbquery import run_sql, blob_to_string from invenio.config import CFG_CERN_SITE from invenio.access_control_config import CFG_ACC_EMPTY_ROLE_DEFINITION_SRC, \ CFG_ACC_EMPTY_ROLE_DEFINITION_SER, CFG_ACC_EMPTY_ROLE_DEFINITION_OBJ from invenio.errorlib import register_exception # INTERFACE def compile_role_definition(firerole_def_src): """ Given a text in which every row contains a rule it returns the compiled object definition. Rules have the following syntax: allow|deny [not] field {list of one or more (double)quoted string or regexp} or allow|deny any Every row may contain a # sign followed by a comment which are discarded. Field could be any key contained in a user_info dictionary. If the key does not exist in the dictionary, the rule is skipped. The first rule which matches return. """ line = 0 ret = [] default_allow_p = False if not firerole_def_src or not firerole_def_src.strip(): firerole_def_src = CFG_ACC_EMPTY_ROLE_DEFINITION_SRC for row in firerole_def_src.split('\n'): line += 1 row = row.strip() if not row: continue clean_row = _no_comment_re.sub('', row) if clean_row: g = _any_rule_re.match(clean_row) if g: default_allow_p = g.group('command').lower() == 'allow' break g = _rule_re.match(clean_row) if g: allow_p = g.group('command').lower() == 'allow' not_p = g.group('not') != None field = g.group('field').lower() # Renaming groups to group for alias_item in _aliasTable: if field in alias_item: field = alias_item[0] break if field.startswith('precached_'): raise InvenioWebAccessFireroleError("Error while compiling rule %s (line %s): %s is a reserved key and can not be used in FireRole rules!" % (row, line, field)) expressions = g.group('expression')+g.group('more_expressions') expressions_list = [] for expr in _expressions_re.finditer(expressions): expr = expr.group() if field in ('from', 'until'): try: expressions_list.append((False, time.mktime(time.strptime(expr[1:-1], '%Y-%m-%d')))) except Exception, msg: raise InvenioWebAccessFireroleError("Syntax error while compiling rule %s (line %s): %s is not a valid date with format YYYY-MM-DD because %s!" 
% (row, line, expr, msg)) elif expr[0] == '/': try: expressions_list.append((True, re.compile(expr[1:-1], re.I))) except Exception, msg: raise InvenioWebAccessFireroleError("Syntax error while compiling rule %s (line %s): %s is not a valid re because %s!" % (row, line, expr, msg)) else: if field == 'remote_ip' and '/' in expr[1:-1]: try: expressions_list.append((False, _ip_matcher_builder(expr[1:-1]))) except Exception, msg: raise InvenioWebAccessFireroleError("Syntax error while compiling rule %s (line %s): %s is not a valid ip group because %s!" % (row, line, expr, msg)) else: expressions_list.append((False, expr[1:-1])) expressions_list = tuple(expressions_list) if field in ('from', 'until'): if len(expressions_list) != 1: raise InvenioWebAccessFireroleError("Error when compiling rule %s (line %s): exactly one date is expected when using 'from' or 'until', but %s were found" % (row, line, len(expressions_list))) if not_p: raise InvenioWebAccessFireroleError("Error when compiling rule %s (line %s): 'not' is not allowed when using 'from' or 'until'" % (row, line)) ret.append((allow_p, not_p, field, expressions_list)) else: raise InvenioWebAccessFireroleError("Syntax error while compiling rule %s (line %s): not a valid rule!" % (row, line)) return (default_allow_p, tuple(ret)) def repair_role_definitions(): """ Try to rebuild compiled serialized definitions from their respectives sources. This is needed in case Python break back compatibility. """ definitions = run_sql("SELECT id, firerole_def_src FROM accROLE") for role_id, firerole_def_src in definitions: - run_sql("UPDATE accROLE SET firerole_def_ser=%s WHERE id=%s", (serialize(compile_role_definition(firerole_def_src)), role_id)) + firerole_def_ser = serialize(compile_role_definition(firerole_def_src)) + if firerole_def_ser: + run_sql("UPDATE accROLE SET firerole_def_ser=_binary %s WHERE id=%s", (firerole_def_ser, role_id)) + else: + run_sql("UPDATE accROLE SET firerole_def_ser=%s WHERE id=%s", (firerole_def_ser, role_id)) def store_role_definition(role_id, firerole_def_ser, firerole_def_src): """ Store a compiled serialized definition and its source in the database alongside the role to which it belong. @param role_id: the role_id @param firerole_def_ser: the serialized compiled definition @param firerole_def_src: the sources from which the definition was taken """ - run_sql("UPDATE accROLE SET firerole_def_ser=%s, firerole_def_src=%s WHERE id=%s", (firerole_def_ser, firerole_def_src, role_id)) + if firerole_def_ser: + run_sql("UPDATE accROLE SET firerole_def_ser=_binary %s, firerole_def_src=%s WHERE id=%s", (firerole_def_ser, firerole_def_src, role_id)) + else: + run_sql("UPDATE accROLE SET firerole_def_ser=%s, firerole_def_src=%s WHERE id=%s", (firerole_def_ser, firerole_def_src, role_id)) def load_role_definition(role_id): """ Load the definition corresponding to a role. If the compiled definition is corrupted it try to repairs definitions from their sources and try again to return the definition. @param role_id: @return: a deserialized compiled role definition """ res = run_sql("SELECT firerole_def_ser FROM accROLE WHERE id=%s", (role_id, ), 1, run_on_slave=True) if res: try: return deserialize(res[0][0]) except Exception: ## Something bad might have happened? (Update of Python?) 
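# The stored pickle could not be loaded (e.g. after a Python upgrade), so rebuild every serialized
# definition from its source (repair_role_definitions() re-stores them, using the _binary introducer
# so MySQL keeps the compressed pickle as raw bytes) and then retry the lookup once.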
repair_role_definitions() res = run_sql("SELECT firerole_def_ser FROM accROLE WHERE id=%s", (role_id, ), 1, run_on_slave=True) if res: return deserialize(res[0][0]) return CFG_ACC_EMPTY_ROLE_DEFINITION_OBJ def acc_firerole_extract_emails(firerole_def_obj): """ Best effort function to extract all the possible email addresses authorized by the given firerole. """ authorized_emails = set() try: default_allow_p, rules = firerole_def_obj for (allow_p, not_p, field, expressions_list) in rules: # for every rule if not_p: continue if field == 'group': for reg_p, expr in expressions_list: if reg_p: continue if CFG_CERN_SITE and expr.endswith(' [CERN]'): authorized_emails.add(expr[:-len(' [CERN]')].lower().strip() + '@cern.ch') emails = run_sql("SELECT user.email FROM usergroup JOIN user_usergroup ON usergroup.id=user_usergroup.id_usergroup JOIN user ON user.id=user_usergroup.id_user WHERE usergroup.name=%s", (expr, )) for email in emails: authorized_emails.add(email[0].lower().strip()) elif field == 'email': for reg_p, expr in expressions_list: if reg_p: continue authorized_emails.add(expr.lower().strip()) elif field == 'uid': for reg_p, expr in expressions_list: if reg_p: continue email = run_sql("SELECT email FROM user WHERE id=%s", (expr, )) if email: authorized_emails.add(email[0][0].lower().strip()) return authorized_emails except Exception, msg: raise InvenioWebAccessFireroleError, msg def acc_firerole_check_user(user_info, firerole_def_obj): """ Given a user_info dictionary, it matches the rules inside the deserializez compiled definition in order to discover if the current user match the roles corresponding to this definition. @param user_info: a dict produced by collect_user_info which contains every info about a user @param firerole_def_obj: a compiled deserialized definition produced by compile_role_defintion @return: True if the user match the definition, False otherwise. """ try: default_allow_p, rules = firerole_def_obj for (allow_p, not_p, field, expressions_list) in rules: # for every rule group_p = field == 'group' # Is it related to group? ip_p = field == 'remote_ip' # Is it related to Ips? until_p = field == 'until' # Is it related to dates? from_p = field == 'from' # Idem. next_expr_p = False # Silly flag to break 2 for cycles if not user_info.has_key(field) and not from_p and not until_p: continue for reg_p, expr in expressions_list: # For every element in the rule if group_p: # Special case: groups if reg_p: # When it is a regexp for group in user_info[field]: # iterate over every group if expr.match(group): # if it matches if not_p: # if must not match next_expr_p = True # let's skip to next expr break else: # Ok! return allow_p if next_expr_p: break # I said: let's skip to next rule ;-) elif expr.lower() in [group.lower() for group in user_info[field]]: # Simple expression then just check for expr in groups if not_p: # If expr is in groups then if must not match break # let's skip to next expr else: # Ok! return allow_p elif reg_p: # Not a group, then easier. If it's a regexp if expr.match(user_info[field]): # if it matches if not_p: # If must not match break # Let's skip to next expr else: return allow_p # Ok! elif ip_p and type(expr) == type(()): # If it's just a simple expression but an IP! if _ipmatch(user_info['remote_ip'], expr): # Then if Ip matches if not_p: # If must not match break # let's skip to next expr else: return allow_p # ok! 
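# Date-window rules: 'until' compares against a date the rule is valid up to, 'from' against a date
# it is valid from; when the current time falls outside an allow rule's window (or inside a deny
# rule's window) the whole check fails, otherwise evaluation continues with the next rule.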
elif until_p: if time.time() <= expr: if allow_p: break else: return False elif allow_p: return False else: break elif from_p: if time.time() >= expr: if allow_p: break else: return False elif allow_p: return False else: break elif expr.lower() == str(user_info[field]).lower(): # Finally the easiest one!! if not_p: # ... break else: # ... return allow_p # ... if not_p and not next_expr_p: # Nothing has matched and we got not return allow_p # Then the whole rule matched! except Exception, msg: register_exception(alert_admin=True) raise InvenioWebAccessFireroleError, msg return default_allow_p # By default we allow ;-) it'an OpenAccess project def serialize(firerole_def_obj): """ Serialize and compress a definition.""" if firerole_def_obj == CFG_ACC_EMPTY_ROLE_DEFINITION_OBJ: return CFG_ACC_EMPTY_ROLE_DEFINITION_SER elif firerole_def_obj: return compress(cPickle.dumps(firerole_def_obj, -1)) else: return CFG_ACC_EMPTY_ROLE_DEFINITION_SER def deserialize(firerole_def_ser): """ Deserialize and decompress a definition.""" if firerole_def_ser: return cPickle.loads(decompress(blob_to_string(firerole_def_ser))) else: return CFG_ACC_EMPTY_ROLE_DEFINITION_OBJ # IMPLEMENTATION # Comment finder _no_comment_re = re.compile(r'[\s]*(?allow|deny)[\s]+(?:(?Pnot)[\s]+)?(?P[\w]+)[\s]+(?P(?([\s]*,[\s]*((?allow|deny)[\s]+(any|all)[\s]*', re.I) # Sub expression finder _expressions_re = re.compile(r'(? group member ? query_group_baskets = """ SELECT share_level FROM user_usergroup AS ug LEFT JOIN usergroup_bskBASKET AS ub ON ug.id_usergroup=ub.id_usergroup WHERE ug.id_user=%s AND ub.id_bskBASKET=%s AND NOT(ub.share_level='NO') AND ug.user_status!=%s """ params_group_baskets = (int(uid), int(bskid), CFG_WEBSESSION_USERGROUP_STATUS['PENDING']) res = run_sql(query_group_baskets, params_group_baskets) group_index = None if res: try: group_index = CFG_WEBBASKET_SHARE_LEVELS_ORDERED.index(res[0][0]) except: return None # public basket ? 
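# (id_usergroup=0 is the conventional "shared with everybody" group, so a usergroup_bskBASKET row
# with id_usergroup=0 marks the basket as public)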
query_public_baskets = """ SELECT share_level FROM usergroup_bskBASKET WHERE id_usergroup=0 AND id_bskBASKET=%s """ public_index = None res = run_sql(query_public_baskets, (int(bskid),)) if res: try: public_index = CFG_WEBBASKET_SHARE_LEVELS_ORDERED.index(res[0][0]) except: return None if group_index or public_index: if group_index > public_index: return CFG_WEBBASKET_SHARE_LEVELS_ORDERED[group_index] else: return CFG_WEBBASKET_SHARE_LEVELS_ORDERED[public_index] return None ########################### Personal baskets ################################## def get_personal_baskets_info_for_topic(uid, topic): """Return information about every basket that belongs to the given user and topic.""" query = """ SELECT bsk.id, bsk.name, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, count(rec.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(rec.date_added), '%%Y-%%m-%%d %%H:%%i:%%s') FROM user_bskBASKET AS ubsk JOIN bskBASKET AS bsk ON bsk.id=ubsk.id_bskBASKET AND bsk.id_owner=%s LEFT JOIN bskREC AS rec ON rec.id_bskBASKET=bsk.id WHERE ubsk.id_user=%s AND ubsk.topic=%s GROUP BY bsk.id ORDER BY bsk.name""" params = (uid, uid, topic) res = run_sql(query, params) return res def get_all_user_personal_basket_ids_by_topic(uid): """For a given user return all their personal basket ids grouped by topic.""" query = """ SELECT ubsk.topic, GROUP_CONCAT(bsk.id) FROM user_bskBASKET AS ubsk JOIN bskBASKET AS bsk ON ubsk.id_bskBASKET=bsk.id AND ubsk.id_user=bsk.id_owner WHERE bsk.id_owner=%s GROUP BY ubsk.topic ORDER BY ubsk.topic""" params = (uid,) res = run_sql(query, params) return res def get_all_personal_baskets_names(uid): """ for a given user, returns every basket he is owner of returns list of tuples: (bskid, bsk_name, topic) """ query = """ SELECT bsk.id, bsk.name, ubsk.topic FROM user_bskBASKET ubsk JOIN bskBASKET bsk ON ubsk.id_bskBASKET=bsk.id AND ubsk.id_user=bsk.id_owner WHERE bsk.id_owner=%s ORDER BY ubsk.topic """ params = (int(uid),) return run_sql(query, params) def get_basket_name(bskid): """return the name of a given basket""" query = 'SELECT name FROM bskBASKET where id=%s' res = run_sql(query, (int(bskid), )) if res: return res[0][0] else: return '' def is_personal_basket_valid(uid, bskid): """Check if the basked (bskid) belongs to user (uid) and is valid.""" query = """ SELECT id FROM bskBASKET WHERE id=%s AND id_owner=%s""" params = (bskid, uid) res = run_sql(query, params) return res def is_topic_valid(uid, topic): """Check if the topic defined by user (uid) exists.""" query = """ SELECT distinct(topic) FROM user_bskBASKET WHERE topic=%s AND id_user=%s""" params = (topic, uid) res = run_sql(query, params) return res def get_basket_topic(uid, bskid): """Return the name of the topic this basket (bskid) belongs to.""" query = """ SELECT topic FROM user_bskBASKET WHERE id_bskBASKET=%s AND id_user=%s""" params = (bskid,uid) res = run_sql(query, params) return res def get_personal_topics_infos(uid): """ Get the list of every topic user has defined, and the number of baskets in each topic @param uid: user id (int) @return: a list of tuples (topic name, nb of baskets) """ query = """SELECT topic, count(b.id) FROM user_bskBASKET ub JOIN bskBASKET b ON ub.id_bskBASKET=b.id AND b.id_owner=ub.id_user WHERE ub.id_user=%s GROUP BY topic ORDER BY topic""" uid = int(uid) res = run_sql(query, (uid,)) return res def get_basket_ids_and_names(bskids, limit=0): """For the given basket ids, return their ids and names, ordered by basket name. 
If 'limit' is greater than 0, limit the number of results returned.""" if not((type(bskids) is list) or (type(bskids) is tuple)): bskids = [bskids] query = """ SELECT bsk.id, bsk.name FROM bskBASKET AS bsk WHERE %s ORDER BY bsk.name %s""" sep = ' OR ' query %= (sep.join(['id=%s'] * len(bskids)), limit and 'LIMIT %i' % limit or '') params = tuple(bskids) res = run_sql(query, params) return res def rename_basket(bskid, new_name): """Rename basket to new_name""" run_sql("UPDATE bskBASKET SET name=%s WHERE id=%s", (new_name, bskid)) def rename_topic(uid, old_topic, new_topic): """Rename topic to new_topic """ res = run_sql("UPDATE user_bskBASKET SET topic=%s WHERE id_user=%s AND topic=%s", (new_topic, uid, old_topic)) return res def move_baskets_to_topic(uid, bskids, new_topic): """Move given baskets to another topic""" if not((type(bskids) is list) or (type(bskids) is tuple)): bskids = [bskids] query = "UPDATE user_bskBASKET SET topic=%s WHERE id_user=%s AND (" query += ' OR '.join(['id_bskBASKET=%s'] * len(bskids)) query += ")" params = (new_topic, uid) + tuple(bskids) res = run_sql(query, params) return res def delete_basket(bskid): """Delete given basket.""" # TODO: check if any alerts are automaticly adding items to the given basket. bskid = int(bskid) query1 = "DELETE FROM bskBASKET WHERE id=%s" res = run_sql(query1, (bskid,)) query2A = "SELECT id_bibrec_or_bskEXTREC FROM bskREC WHERE id_bskBASKET=%s" local_and_external_ids = run_sql(query2A, (bskid,)) external_ids = [local_and_external_id[0] for local_and_external_id in \ local_and_external_ids if local_and_external_id[0]<0] for external_id in external_ids: delete_item(bskid=bskid, recid=external_id, update_date_modification=False) query2B = "DELETE FROM bskREC WHERE id_bskBASKET=%s" run_sql(query2B, (bskid,)) query3 = "DELETE FROM bskRECORDCOMMENT WHERE id_bskBASKET=%s" run_sql(query3, (bskid,)) query4 = "DELETE FROM user_bskBASKET WHERE id_bskBASKET=%s" run_sql(query4, (bskid,)) query5 = "DELETE FROM usergroup_bskBASKET WHERE id_bskBASKET=%s" run_sql(query5, (bskid,)) query6 = "DELETE FROM user_query_basket WHERE id_basket=%s" run_sql(query6, (bskid,)) return int(res) def create_basket(uid, basket_name, topic): """Create new basket for given user in given topic""" now = convert_datestruct_to_datetext(localtime()) id_bsk = run_sql("""INSERT INTO bskBASKET (id_owner, name, date_modification) VALUES (%s, %s, %s)""", (uid, basket_name, now)) run_sql("""INSERT INTO user_bskBASKET (id_user, id_bskBASKET, topic) VALUES (%s, %s, %s)""", (uid, id_bsk, topic)) return id_bsk def get_all_items_in_user_personal_baskets(uid, topic="", format='hb'): """For the specified user, return all the items in their personal baskets, grouped by basket if local or as a list if external. 
If topic is set, return only that topic's items.""" if topic: topic_clause = """AND ubsk.topic=%s""" params_local = (uid, uid, topic) params_external = (uid, uid, topic, format) else: topic_clause = "" params_local = (uid, uid) params_external = (uid, uid, format) query_local = """ SELECT rec.id_bskBASKET, bsk.name, ubsk.topic, GROUP_CONCAT(rec.id_bibrec_or_bskEXTREC) FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET AND bsk.id_owner=%%s JOIN user_bskBASKET AS ubsk ON ubsk.id_bskBASKET=rec.id_bskBASKET AND ubsk.id_user=%%s %s WHERE rec.id_bibrec_or_bskEXTREC > 0 GROUP BY rec.id_bskBASKET""" % (topic_clause,) res_local = run_sql(query_local, params_local) query_external = """ SELECT rec.id_bskBASKET, bsk.name, ubsk.topic, rec.id_bibrec_or_bskEXTREC, ext.value FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET AND bsk.id_owner=%%s JOIN user_bskBASKET AS ubsk ON ubsk.id_bskBASKET=rec.id_bskBASKET AND ubsk.id_user=%%s %s JOIN bskEXTFMT AS ext ON ext.id_bskEXTREC=-rec.id_bibrec_or_bskEXTREC AND ext.format=%%s WHERE rec.id_bibrec_or_bskEXTREC < 0 ORDER BY rec.id_bskBASKET""" % (topic_clause,) res_external = run_sql(query_external, params_external) return (res_local, res_external) def get_all_items_in_user_personal_baskets_by_matching_notes(uid, topic="", p=""): """For the specified user, return all the items in their personal baskets matching their notes' titles and bodies, grouped by basket. If topic is set, return only that topic's items.""" p = p and '%' + p + '%' or '%' if topic: topic_clause = """AND ubsk.topic=%s""" params = (uid, uid, topic, p, p) else: topic_clause = "" params = (uid, uid, p, p) query = """ SELECT notes.id_bskBASKET, bsk.name, ubsk.topic, GROUP_CONCAT(DISTINCT(notes.id_bibrec_or_bskEXTREC)) FROM bskRECORDCOMMENT AS notes JOIN bskBASKET AS bsk ON bsk.id=notes.id_bskBASKET AND bsk.id_owner=%%s JOIN user_bskBASKET AS ubsk ON ubsk.id_bskBASKET=notes.id_bskBASKET AND ubsk.id_user=%%s %s WHERE notes.title like %%s OR notes.body like %%s GROUP BY notes.id_bskBASKET""" % (topic_clause,) res = run_sql(query, params) return res def get_all_user_topics(uid): """Return a list of the user's topics.""" query = """ SELECT ubsk.topic FROM bskBASKET AS bsk JOIN user_bskBASKET AS ubsk ON ubsk.id_bskBASKET=bsk.id AND ubsk.id_user=bsk.id_owner WHERE bsk.id_owner=%s GROUP BY ubsk.topic""" params = (uid,) res = run_sql(query, params) return res ########################## Actions on baskets ################################# def get_basket_record(bskid, recid, format='hb'): """get record recid in basket bskid """ if recid < 0: rec_table = 'bskEXTREC' format_table = 'bskEXTFMT' id_field = 'id_bskEXTREC' sign = '-' else: rec_table = 'bibrec' format_table = 'bibfmt' id_field = 'id_bibrec' sign = '' query = """ SELECT DATE_FORMAT(record.creation_date, '%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s'), DATE_FORMAT(record.modification_date, '%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s'), DATE_FORMAT(bskREC.date_added, '%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s'), user.nickname, count(cmt.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(cmt.date_creation), '%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s'), fmt.value FROM bskREC LEFT JOIN user ON bskREC.id_user_who_added_item=user.id LEFT JOIN bskRECORDCOMMENT cmt ON bskREC.id_bibrec_or_bskEXTREC=cmt.id_bibrec_or_bskEXTREC LEFT JOIN %(rec_table)s record ON (%(sign)sbskREC.id_bibrec_or_bskEXTREC=record.id) LEFT JOIN %(format_table)s fmt ON (record.id=fmt.%(id_field)s) WHERE bskREC.id_bskBASKET=%%s AND bskREC.id_bibrec_or_bskEXTREC=%%s AND fmt.format=%%s GROUP BY 
bskREC.id_bibrec_or_bskEXTREC """ % {'rec_table': rec_table, 'sign': sign, 'format_table': format_table, 'id_field':id_field} params = (int(bskid), int(recid), format) res = run_sql(query, params) if res: return __decompress_last(res[0]) return () def get_basket_content(bskid, format='hb'): """Get all records for a given basket.""" query = """ SELECT rec.id_bibrec_or_bskEXTREC, extrec.collection_id, count(cmt.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(cmt.date_creation), '%%Y-%%m-%%d %%H:%%i:%%s'), extern.value as ext_val, intern.value as int_val, rec.score FROM bskREC AS rec LEFT JOIN bskRECORDCOMMENT AS cmt ON (rec.id_bibrec_or_bskEXTREC=cmt.id_bibrec_or_bskEXTREC AND rec.id_bskBASKET=cmt.id_bskBASKET) LEFT JOIN bskEXTFMT AS extern ON (-rec.id_bibrec_or_bskEXTREC=extern.id_bskEXTREC AND extern.format=%s) LEFT JOIN bibfmt AS intern ON (rec.id_bibrec_or_bskEXTREC=intern.id_bibrec AND intern.format=%s) LEFT JOIN bskEXTREC AS extrec ON extrec.id=-rec.id_bibrec_or_bskEXTREC WHERE rec.id_bskBASKET=%s GROUP BY rec.id_bibrec_or_bskEXTREC ORDER BY rec.score""" params = (format, format, int(bskid)) res = run_sql(query, params) if res: query2 = "UPDATE bskBASKET SET nb_views=nb_views+1 WHERE id=%s" run_sql(query2, (int(bskid),)) return res return () def get_basket_item(bskid, recid, format='hb'): """Get item (recid) for a given basket.""" query = """ SELECT rec.id_bibrec_or_bskEXTREC, extrec.collection_id, count(cmt.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(cmt.date_creation), '%%Y-%%m-%%d %%H:%%i:%%s'), extern.value as ext_val, intern.value as int_val, rec.score FROM bskREC rec LEFT JOIN bskRECORDCOMMENT cmt ON (rec.id_bibrec_or_bskEXTREC=cmt.id_bibrec_or_bskEXTREC AND rec.id_bskBASKET=cmt.id_bskBASKET) LEFT JOIN bskEXTFMT extern ON (-rec.id_bibrec_or_bskEXTREC=extern.id_bskEXTREC AND extern.format=%s) LEFT JOIN bibfmt intern ON (rec.id_bibrec_or_bskEXTREC=intern.id_bibrec AND intern.format=%s) LEFT JOIN bskEXTREC AS extrec ON extrec.id=-rec.id_bibrec_or_bskEXTREC WHERE rec.id_bskBASKET=%s AND rec.id_bibrec_or_bskEXTREC=%s GROUP BY rec.id_bibrec_or_bskEXTREC ORDER BY rec.score""" params = (format, format, bskid, recid) res = run_sql(query, params) if res: queryU = """UPDATE bskBASKET SET nb_views=nb_views+1 WHERE id=%s""" paramsU = (bskid,) run_sql(queryU, paramsU) score = res[0][6] query_previous = """SELECT id_bibrec_or_bskEXTREC FROM bskREC WHERE id_bskBASKET=%s AND score<%s ORDER BY score DESC LIMIT 1""" params_previous = (bskid, score) res_previous = run_sql(query_previous, params_previous) query_next = """SELECT id_bibrec_or_bskEXTREC FROM bskREC WHERE id_bskBASKET=%s AND score>%s ORDER BY score ASC LIMIT 1""" params_next = (bskid, score) res_next = run_sql(query_next, params_next) query_index = """ SELECT COUNT(id_bibrec_or_bskEXTREC) FROM bskREC WHERE id_bskBASKET=%s AND score<=%s ORDER BY score""" params_index = (bskid, score) res_index = run_sql(query_index, params_index) res_index = __wash_sql_count(res_index) return (res[0], res_previous and res_previous[0][0] or 0, res_next and res_next[0][0] or 0, res_index) else: return () def get_basket_item_title_and_URL(recid): """ Retrieves the title and URL for the specified item in the specified basket. @param bskid: The basked id @type bskid: int @param recid: The record (item) id @type recid: int @return: A tuple containing the title as a sting and the URL as a string. """ if recid > 0: # This is a local record, we can easily retrieve the title using the # search engine's get_fieldvalues function and the MARC field and tag. 
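# MARC tag 245 holds the title statement; get_fieldvalues() returns a (possibly empty) list of
# values for that tag, so the first element, when present, is taken as the record title.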
title_list = get_fieldvalues(recid, '245___') # Check if the main title is always the first element in the list if title_list: title = title_list[0] else: title = "" url = '%s/record/%i' % (CFG_SITE_URL, recid) elif recid < 0: # This is an external record or item, use title = "This is an external record or item." url = '%s' % (CFG_SITE_URL,) query = """ SELECT rec.collection_id, rec.original_url, fmt.value FROM bskEXTREC as rec, bskEXTFMT as fmt WHERE rec.id=%s AND fmt.id_bskEXTREC=%s AND fmt.format='hb'""" params = (-recid, -recid) result = run_sql(query, params) if result: item = __decompress_last(result[0]) collection = item[0] url = item[1] hb = item[2] if collection == 0: # This is an external item title = hb.split('\n',1)[0] elif collection > 0: # This is an external record from a hosted collection title = hb.split('',1)[0].split('')[-1] return (title, url) def share_basket_with_group(bskid, group_id, share_level=CFG_WEBBASKET_SHARE_LEVELS['READITM']): """ Share basket bskid with group group_id with given share_level @param share_level: see CFG_WEBBASKET_SHARE_LEVELS in webbasket_config """ now = convert_datestruct_to_datetext(localtime()) run_sql("""REPLACE INTO usergroup_bskBASKET (id_usergroup, id_bskBASKET, date_shared, share_level) VALUES (%s,%s,%s,%s)""", (group_id, bskid, now, str(share_level))) def update_rights(bskid, group_rights): """update rights (permissions) for groups. @param bskid: basket id @param group_rights: dictionary of {group id: new rights} """ now = convert_datestruct_to_datetext(localtime()) query1 = """REPLACE INTO usergroup_bskBASKET (id_usergroup, id_bskBASKET, date_shared, share_level) VALUES """ + \ ', '.join(["(%s, %s, %s, %s)"] * len(group_rights.items())) params = () for (group_id, share_level) in group_rights.items(): params += (int(group_id), int(bskid), now, str(share_level)) run_sql(query1, params) query2 = """DELETE FROM usergroup_bskBASKET WHERE share_level='NO'""" run_sql(query2) def move_item(bskid, recid, direction): """Change score of an item in a basket""" bskid = int(bskid) query1 = """SELECT id_bibrec_or_bskEXTREC, score FROM bskREC WHERE id_bskBASKET=%s ORDER BY score, date_added""" items = run_sql(query1, (bskid,)) (recids, scores) = zip(*items) (recids, scores) = (list(recids), list(scores)) if len(recids) and recid in recids: current_index = recids.index(recid) if direction == CFG_WEBBASKET_ACTIONS['UP']: switch_index = 0 if current_index != 0: switch_index = current_index -1 else: switch_index = len(recids) - 1 if current_index != len(recids)-1: switch_index = current_index + 1 query2 = """UPDATE bskREC SET score=%s WHERE id_bskBASKET=%s AND id_bibrec_or_bskEXTREC=%s""" res1 = run_sql(query2, (scores[switch_index], bskid, recids[current_index])) res2 = run_sql(query2, (scores[current_index], bskid, recids[switch_index])) if res1 and res2: now = convert_datestruct_to_datetext(localtime()) query3 = "UPDATE bskBASKET SET date_modification=%s WHERE id=%s" params3 = (now, int(bskid)) run_sql(query3, params3) def delete_item(bskid, recid, update_date_modification=True): """Remove item recid from basket bskid""" if recid < 0: query0A = "select count(id_bskBASKET) from bskREC where id_bibrec_or_bskEXTREC=%s" % (int(recid)) ncopies = run_sql(query0A) if ncopies and ncopies[0][0]<=1: # uncomment the following 5 lines and comment the following 2 to delete cached records # only for external sources and not for external records #query0B = "SELECT collection_id FROM bskEXTREC WHERE id=%s" % (-int(recid)) #colid = run_sql(query0B) #if colid and 
colid[0][0]==0: #query0C = "DELETE from bskEXTFMT WHERE id_bskEXTREC=%s" % (-int(recid)) #run_sql(query0C) # the following two lines delete cached external records. We could keep them if we find # a way to reuse them in case the external records are added again in the future. query0D = "DELETE from bskEXTFMT WHERE id_bskEXTREC=%s" % (-int(recid)) run_sql(query0D) query0E = "DELETE from bskEXTREC WHERE id=%s" % (-int(recid)) run_sql(query0E) query_notes = "DELETE FROM bskRECORDCOMMENT WHERE id_bskBASKET=%s AND id_bibrec_or_bskEXTREC=%s" run_sql(query_notes, (bskid, recid,)) query1 = "DELETE from bskREC WHERE id_bskBASKET=%s AND id_bibrec_or_bskEXTREC=%s" params1 = (int(bskid), int(recid)) res = run_sql(query1, params1) if update_date_modification and res: now = convert_datestruct_to_datetext(localtime()) query2 = "UPDATE bskBASKET SET date_modification=%s WHERE id=%s" params2 = (now, int(bskid)) run_sql(query2, params2) return res def add_to_basket(uid, recids=[], colid=0, bskid=0, es_title="", es_desc="", es_url=""): """Add items (recids) basket (bskid).""" if (recids or (colid == -1 and es_title and es_desc and es_url)) and bskid > 0: query_max_score = """ SELECT MAX(score) FROM bskREC WHERE id_bskBASKET=%s""" params_max_score = (bskid,) res_max_score = run_sql(query_max_score, params_max_score) max_score = __wash_sql_count(res_max_score) if not max_score: # max_score == None actually means that the basket doesn't exist. # Maybe we should return 0 and inform the admin? max_score = 1 if colid > 0: query_existing = """ SELECT id, external_id FROM bskEXTREC WHERE %s AND collection_id=%s""" sep_or = ' OR ' query_existing %= (sep_or.join(['external_id=%s'] * len(recids)), colid) params_existing = tuple(recids) res_existing = run_sql(query_existing, params_existing) existing_recids = [int(external_ids_couple[1]) for external_ids_couple in res_existing] existing_ids = [int(ids[0]) for ids in res_existing] new_recids = [recid for recid in recids if int(recid) not in existing_recids] # sets approach #existing_recids = [ids[1] for ids in res_existing] #new_recids = list(set(recids)-set(existing_recids)) if new_recids: query_new = """ INSERT INTO bskEXTREC (external_id, collection_id, creation_date, modification_date) VALUES """ now = convert_datestruct_to_datetext(localtime()) records = ["(%s, %s, %s, %s)"] * len(new_recids) query_new += ', '.join(records) params_new = () for new_recid in new_recids: params_new += (int(new_recid), colid, now, now) res_new = run_sql(query_new, params_new) recids = [-int(recid) for recid in existing_ids] recids.extend(range(-res_new,-(res_new+len(new_recids)),-1)) else: recids = [-int(recid) for recid in existing_ids] elif colid < 0: query_external = """INSERT INTO bskEXTREC (collection_id, original_url, creation_date, modification_date) VALUES (%s, %s, %s, %s)""" now = convert_datestruct_to_datetext(localtime()) params_external = (colid, es_url, now, now) res_external = run_sql(query_external, params_external) recids = [-res_external] store_external_source(res_external, es_title, es_desc, es_url, 'xm') store_external_source(res_external, es_title, es_desc, es_url, 'hb') query_insert = """ INSERT IGNORE INTO bskREC (id_bibrec_or_bskEXTREC, id_bskBASKET, id_user_who_added_item, date_added, score) VALUES """ if colid == 0 or (colid > 0 and not new_recids): now = convert_datestruct_to_datetext(localtime()) records = ["(%s, %s, %s, %s, %s)"] * len(recids) query_insert += ', '.join(records) params_insert = () i = 1 for recid in recids: params_insert += (recid, bskid, uid, 
now, max_score + i) i += 1 run_sql(query_insert, params_insert) query_update = """ UPDATE bskBASKET SET date_modification=%s WHERE id=%s""" params_update = (now, bskid) run_sql(query_update, params_update) return recids return 0 def move_to_basket(uid, recids=None, old_bskid=0, new_bskid=0, update_date_modification=True): """ Move items (recids) from basket (old_bskid) to basket (new_bskid) """ if (recids is not None) and len(recids) > 0: moved_recids = [] for recid in recids: # Prevent duplication of items query = """ SELECT '1' FROM bskREC WHERE id_bskBASKET=%s AND id_bibrec_or_bskEXTREC=%s """ params = (int(new_bskid), int(recid)) res = run_sql(query, params) if len(res) == 0: # Change the item's pointer to basket query = """ UPDATE bskREC SET id_bskBASKET=%s, id_user_who_added_item=%s WHERE id_bskBASKET=%s AND id_bibrec_or_bskEXTREC=%s """ params = (int(new_bskid), int(uid), int(old_bskid), int(recid)) res = run_sql(query, params) moved_recids.append(int(recid)) # Update 'modification date' if len(moved_recids) > 0 and update_date_modification: now = convert_datestruct_to_datetext(localtime()) query = "UPDATE bskBASKET SET date_modification=%s WHERE id=%s" params = (now, int(old_bskid)) run_sql(query, params) params = (now, int(new_bskid)) run_sql(query, params) return moved_recids def add_to_many_baskets(uid, recids=[], colid=0, bskids=[], es_title="", es_desc="", es_url=""): """Add items recids to every basket in bskids list.""" if (len(recids) or colid == -1) and len(bskids): query1 = """SELECT id_bskBASKET, max(score) FROM bskREC WHERE %s GROUP BY id_bskBASKET""" bskids = [bskid for bskid in bskids if int(bskid) >= 0] sep_or = ' OR ' query1 %= sep_or.join(['id_bskBASKET=%s'] * len(bskids)) bsks = dict.fromkeys(bskids, 0) params = tuple(bskids) bsks.update(dict(run_sql(query1, params))) if colid > 0: query2A = """SELECT id, external_id FROM bskEXTREC WHERE %s AND collection_id=%s""" query2A %= (sep_or.join(['external_id=%s'] * len(recids)), colid) params2A = tuple(recids) res2A = run_sql(query2A, params2A) existing_recids = [int(external_ids_couple[1]) for external_ids_couple in res2A] existing_ids = [int(ids[0]) for ids in res2A] new_recids = [recid for recid in recids if int(recid) not in existing_recids] # sets approach #existing_recids = [ids[1] for ids in res2A] #new_recids = list(set(recids)-set(existing_recids)) if new_recids: query2B = """INSERT INTO bskEXTREC (external_id, collection_id, creation_date, modification_date) VALUES """ now = convert_datestruct_to_datetext(localtime()) records = ["(%s, %s, %s, %s)"] * len(new_recids) query2B += ', '.join(records) params2B = () for new_recid in new_recids: params2B += (int(new_recid), colid, now, now) res = run_sql(query2B, params2B) recids = [-int(recid) for recid in existing_ids] recids.extend(range(-res,-(res+len(new_recids)),-1)) else: recids = [-int(recid) for recid in existing_ids] elif colid < 0: query2C = """INSERT INTO bskEXTREC (collection_id, original_url, creation_date, modification_date) VALUES (%s, %s, %s, %s)""" now = convert_datestruct_to_datetext(localtime()) params = (colid, es_url, now, now) res = run_sql(query2C, params) recids = [-res] store_external_source(res, es_title, es_desc, es_url, 'xm') store_external_source(res, es_title, es_desc, es_url, 'hb') query2 = """INSERT IGNORE INTO bskREC (id_bibrec_or_bskEXTREC, id_bskBASKET, id_user_who_added_item, date_added, score) VALUES """ if colid == 0 or (colid > 0 and not new_recids): now = convert_datestruct_to_datetext(localtime()) records = ["(%s, %s, %s, %s, 
%s)"] * (len(recids) * len(bsks.items())) query2 += ', '.join(records) params = () for (bskid, max_score) in bsks.items(): i = 1 for recid in recids: params += (int(recid), int(bskid), int(uid), now, int(max_score) + i) i += 1 run_sql(query2, params) query3 = """UPDATE bskBASKET SET date_modification=%s WHERE """ query3 += sep_or.join(["id=%s"] * len(bskids)) params = (now,) + tuple(bskids) run_sql(query3, params) return len(bskids) return 0 def get_external_records_by_collection(recids): """Get the selected recids, both local and external, grouped by collection.""" if recids: query = """ SELECT GROUP_CONCAT(id), GROUP_CONCAT(external_id), collection_id FROM bskEXTREC WHERE %s GROUP BY collection_id""" recids = [-recid for recid in recids] sep_or = ' OR ' query %= sep_or.join(['id=%s'] * len(recids)) params = tuple(recids) res = run_sql(query,params) return res return 0 def get_external_records(recids, of="hb"): """Get formatted external records from the database.""" if recids: query = """ SELECT rec.collection_id, fmt.id_bskEXTREC, fmt.value FROM bskEXTFMT AS fmt JOIN bskEXTREC AS rec ON rec.id=fmt.id_bskEXTREC WHERE format=%%s AND ( %s )""" recids = [-recid for recid in recids] sep_or = ' OR ' query %= sep_or.join(['id_bskEXTREC=%s'] * len(recids)) params = [of] params.extend(recids) params = tuple(params) res = run_sql(query,params) return res return () def store_external_records(records, of="hb"): """Store formatted external records to the database.""" if records: query = """INSERT INTO bskEXTFMT (id_bskEXTREC, format, last_updated, value) VALUES """ now = convert_datestruct_to_datetext(localtime()) formatted_records = ["(%s, %s, %s, %s)"] * len(records) query += ', '.join(formatted_records) params = () for record in records: params += (record[0], of, now, compress(record[1])) run_sql(query,params) def store_external_urls(ids_urls): """Store original urls for external records to the database.""" #for id_url in ids_urls.iteritems(): for id_url in ids_urls: query = """UPDATE bskEXTREC SET original_url=%s WHERE id=%s""" params = (id_url[1], id_url[0]) run_sql(query,params) def store_external_source(es_id, es_title, es_desc, es_url, of="hb"): """Store formatted external sources to the database.""" if es_id and es_title and es_desc: query = """INSERT INTO bskEXTFMT (id_bskEXTREC, format, last_updated, value) VALUES (%s, %s, %s, %s)""" now = convert_datestruct_to_datetext(localtime()) value = create_pseudo_record(es_title, es_desc, es_url, of) params = (es_id, of, now, compress(value)) run_sql(query,params) def get_external_colid_and_url(recid): """Get the collection id and original url for an external record.""" if recid: query = """SELECT collection_id, original_url FROM bskEXTREC WHERE id=%s""" params = (-recid,) res = run_sql(query,params) if res: return res else: return 0 ############################ Group baskets #################################### def get_group_baskets_info_for_group(grpid): """Return information about every basket that belongs to the given group, provided the user is its manager or a member of it.""" if not grpid: return () query = """ SELECT bsk.id, bsk.name, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, COUNT(rec.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(rec.date_added), '%%Y-%%m-%%d %%H:%%i:%%s'), ugbsk.share_level, bsk.id_owner FROM usergroup_bskBASKET AS ugbsk JOIN bskBASKET AS bsk ON bsk.id=ugbsk.id_bskBASKET LEFT JOIN bskREC AS rec ON rec.id_bskBASKET=bsk.id WHERE ugbsk.id_usergroup=%s AND ugbsk.share_level!='NO' GROUP BY bsk.id 
ORDER BY bsk.name""" params = (grpid,) res = run_sql(query, params) return res def get_group_name(gid): """Given its id return the group's name.""" query = """ SELECT name FROM usergroup WHERE id=%s""" params = (gid,) res = run_sql(query, params) return res def get_all_user_group_basket_ids_by_group(uid): """For a given user return all their group basket ids grouped by group.""" query = """ SELECT ug.id, ug.name, GROUP_CONCAT(ugbsk.id_bskBASKET) FROM usergroup AS ug JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_usergroup=ug.id JOIN bskBASKET AS bsk ON ugbsk.id_bskBASKET=bsk.id JOIN user_usergroup AS uug ON ug.id=uug.id_usergroup AND uug.id_user=%s GROUP BY ug.name ORDER BY ug.name""" params = (uid,) res = run_sql(query, params) return res def get_all_user_group_basket_ids_by_group_with_add_rights(uid): """For a given user return all their group basket ids grouped by group. Return only the basket ids to which it is allowed to add records.""" query = """ SELECT ug.name, GROUP_CONCAT(ugbsk.id_bskBASKET) FROM usergroup AS ug JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_usergroup=ug.id AND ugbsk.share_level!='NO' AND ugbsk.share_level!='RI' AND ugbsk.share_level!='RC' AND ugbsk.share_level!='AC' JOIN bskBASKET AS bsk ON ugbsk.id_bskBASKET=bsk.id JOIN user_usergroup AS uug ON ug.id=uug.id_usergroup AND uug.id_user=%s GROUP BY ug.name ORDER BY ug.name""" params = (uid,) res = run_sql(query, params) return res def get_all_group_baskets_names(uid, min_rights=CFG_WEBBASKET_SHARE_LEVELS['ADDCMT']): """For a given user returns every group baskets in which he can return a list of tuples: (bskid, bsk_name, group_name).""" # TODO: This function is no longer used. Delete if necessary. uid = int(uid) try: min_rights_num = CFG_WEBBASKET_SHARE_LEVELS_ORDERED.index(min_rights) except ValueError: return () groups = get_groups_user_member_of(uid) if groups: where_clause = '(' where_clause += " OR ".join(["ugbsk.id_usergroup=%s"] * len(groups)) where_clause += ') AND (' where_clause += " OR ".join(["ugbsk.share_level=%s"] * len(CFG_WEBBASKET_SHARE_LEVELS_ORDERED[min_rights_num:])) where_clause += ")" query = """ SELECT bsk.id, bsk.name, ug.name FROM usergroup ug JOIN usergroup_bskBASKET ugbsk ON ug.id=ugbsk.id_usergroup JOIN bskBASKET bsk ON bsk.id=ugbsk.id_bskBASKET WHERE %s AND NOT(ugbsk.share_level='NO') ORDER BY ug.name""" % where_clause params = tuple([group_id for (group_id, dummy) in groups]) params += tuple(CFG_WEBBASKET_SHARE_LEVELS_ORDERED[min_rights_num:]) return run_sql(query, params) return () def is_shared_to(bskids): """For each bskid in bskids get id of one of its group. Used to make distinction between private basket (no group), 'world' basket (0) or group basket (any int > 0) """ if not((type(bskids) == list) or (type(bskids) == tuple)): bskids = [bskids] query = """SELECT b.id, min(u.id_usergroup) FROM bskBASKET b LEFT JOIN usergroup_bskBASKET u ON (b.id=u.id_bskBASKET) """ if len(bskids) != 0: query += " WHERE " query += " OR ".join(['b.id=%s'] * len(bskids)) query += " GROUP BY b.id" params = tuple(bskids) res = run_sql(query, params) if res: return res return () def get_basket_share_level(bskid): """Get the minimum share level of the basket (bskid). 
Returns: None for personal baskets positive integet for group baskets 0 for public baskets Will return 0 if the basket is both group and publicly shared.""" query = """ SELECT MIN(ugbsk.id_usergroup) FROM bskBASKET AS bsk LEFT JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=bsk.id WHERE bsk.id=%s GROUP BY bsk.id""" params = (bskid,) res = run_sql(query, params) return res def get_all_items_in_user_group_baskets(uid, group=0, format='hb'): """For the specified user, return all the items in their group baskets, grouped by basket if local or as a list if external. If group is set, return only that group's items.""" if group: group_clause = """AND ugbsk.id_usergroup=%s""" params_local = (group, uid) params_external = (group, uid, format) else: group_clause = "" params_local = (uid,) params_external = (uid, format) query_local = """ SELECT rec.id_bskBASKET, bsk.name, uug.id_usergroup, ug.name, ugbsk.share_level, GROUP_CONCAT(rec.id_bibrec_or_bskEXTREC) FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=rec.id_bskBASKET %s JOIN user_usergroup AS uug ON uug.id_usergroup=ugbsk.id_usergroup AND uug.id_user=%%s JOIN usergroup AS ug ON ug.id=uug.id_usergroup WHERE rec.id_bibrec_or_bskEXTREC > 0 GROUP BY rec.id_bskBASKET""" % (group_clause,) res_local = run_sql(query_local, params_local) query_external = """ SELECT rec.id_bskBASKET, bsk.name, uug.id_usergroup, ug.name, ugbsk.share_level, rec.id_bibrec_or_bskEXTREC, ext.value FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=rec.id_bskBASKET %s JOIN user_usergroup AS uug ON uug.id_usergroup=ugbsk.id_usergroup AND uug.id_user=%%s JOIN usergroup AS ug ON ug.id=uug.id_usergroup JOIN bskEXTFMT AS ext ON ext.id_bskEXTREC=-rec.id_bibrec_or_bskEXTREC AND ext.format=%%s WHERE rec.id_bibrec_or_bskEXTREC < 0 ORDER BY rec.id_bskBASKET""" % (group_clause,) res_external = run_sql(query_external, params_external) return (res_local, res_external) def get_all_items_in_user_group_baskets_by_matching_notes(uid, group=0, p=""): """For the specified user, return all the items in group personal baskets matching their notes' titles and bodies, grouped by basket. 
If topic is set, return only that topic's items.""" p = p and '%' + p + '%' or '%' if group: group_clause = """AND ugbsk.id_usergroup=%s""" params = (group, uid, p, p) else: group_clause = "" params = (uid, p, p) query = """ SELECT notes.id_bskBASKET, bsk.name, uug.id_usergroup, ug.name, ugbsk.share_level, GROUP_CONCAT(DISTINCT(notes.id_bibrec_or_bskEXTREC)) FROM bskRECORDCOMMENT AS notes JOIN bskBASKET AS bsk ON bsk.id=notes.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=notes.id_bskBASKET AND ugbsk.share_level IS NOT NULL AND ugbsk.share_level!='NO' AND ugbsk.share_level!='RI' %s JOIN user_usergroup AS uug ON uug.id_usergroup=ugbsk.id_usergroup AND uug.id_user=%%s JOIN usergroup AS ug ON ug.id=uug.id_usergroup WHERE notes.title like %%s OR notes.body like %%s GROUP BY notes.id_bskBASKET""" % (group_clause,) res = run_sql(query, params) return res def is_group_basket_valid(uid, bskid): """Check if the basked (bskid) belongs to one of the groups the user (uid) is a member of and is valid.""" query = """ SELECT id FROM bskBASKET AS bsk JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=bsk.id JOIN user_usergroup AS uug ON uug.id_usergroup=ugbsk.id_usergroup AND uug.id_user=%s WHERE id=%s""" params = (uid, bskid) res = run_sql(query, params) return res def is_group_valid(uid, group): """Check if the group exists and the user is a member or manager.""" query = """ SELECT id_usergroup FROM user_usergroup WHERE id_usergroup=%s AND id_user=%s""" params = (group, uid) res = run_sql(query, params) return res def get_all_user_groups(uid): """Return a list of the groups the user is a member of or manages.""" query = """ SELECT ug.id, ug.name FROM user_usergroup AS uug JOIN usergroup AS ug ON ug.id=uug.id_usergroup JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_usergroup=uug.id_usergroup WHERE uug.id_user=%s GROUP BY uug.id_usergroup""" params = (uid,) res = run_sql(query, params) return res ########################## External baskets ################################### def get_external_baskets_infos(uid): """Get general informations about every external basket user uid has subscribed to.""" query = """ SELECT bsk.id, bsk.name, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, count(rec.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(rec.date_added), '%%Y-%%m-%%d %%H:%%i:%%s'), ugbsk.share_level FROM bskBASKET bsk JOIN user_bskBASKET ubsk ON (bsk.id=ubsk.id_bskBASKET AND ubsk.id_user=%s) LEFT JOIN bskREC rec ON (bsk.id=rec.id_bskBASKET) LEFT JOIN usergroup_bskBASKET ugbsk ON (ugbsk.id_bskBASKET=bsk.id AND ugbsk.id_usergroup=0) WHERE bsk.id_owner!=%s GROUP BY bsk.id """ uid = int(uid) params = (uid, uid) res = run_sql(query, params) if res: return res return () def get_external_basket_info(bskid): """""" query = """ SELECT bsk.id, bsk.name, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, count(rec.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(rec.date_added), '%%Y-%%m-%%d %%H:%%i:%%s'), ugbsk.share_level FROM bskBASKET AS bsk LEFT JOIN bskREC AS rec ON bsk.id=rec.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON bsk.id=ugbsk.id_bskBASKET AND ugbsk.id_usergroup=0 WHERE id=%s""" params = (bskid,) res = run_sql(query, params) return res def get_all_external_basket_ids_and_names(uid): """For a given user return all their external baskets (in tuples: (id, name, number_of_records)).""" query = """ SELECT bsk.id, bsk.name, count(rec.id_bibrec_or_bskEXTREC), ugbsk.id_usergroup FROM user_bskBASKET AS ubsk JOIN bskBASKET AS bsk ON 
ubsk.id_bskBASKET=bsk.id AND ubsk.id_user!=bsk.id_owner LEFT JOIN bskREC AS rec ON ubsk.id_bskBASKET=rec.id_bskBASKET LEFT JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_usergroup=0 AND ugbsk.id_bskBASKET=bsk.id WHERE ubsk.id_user=%s GROUP BY bsk.id ORDER BY bsk.name""" params = (uid,) res = run_sql(query, params) return res def count_external_baskets(uid): """Returns the number of external baskets the user is subscribed to.""" query = """ SELECT COUNT(ubsk.id_bskBASKET) FROM user_bskBASKET ubsk LEFT JOIN bskBASKET bsk ON (bsk.id=ubsk.id_bskBASKET AND ubsk.id_user=%s) WHERE bsk.id_owner!=%s""" params = (int(uid), int(uid)) res = run_sql(query, params) return __wash_sql_count(res) def get_all_external_baskets_names(uid, min_rights=CFG_WEBBASKET_SHARE_LEVELS['ADDCMT']): """ for a given user returns every basket which he has subscribed to and in which he can return a list of tuples: (bskid, bsk_name) """ uid = int(uid) try: min_rights_num = CFG_WEBBASKET_SHARE_LEVELS_ORDERED.index(min_rights) except ValueError: return () where_clause = ' AND (' for right in CFG_WEBBASKET_SHARE_LEVELS_ORDERED[min_rights_num:-1]: where_clause += "ugbsk.share_level = '%s' OR " % right where_clause += "ugbsk.share_level = '%s')" % CFG_WEBBASKET_SHARE_LEVELS_ORDERED[-1] query = """ SELECT bsk.id, bsk.name FROM bskBASKET bsk JOIN usergroup_bskBASKET ugbsk ON bsk.id=ugbsk.id_bskBASKET JOIN user_bskBASKET ubsk ON ubsk.id_bskBASKET=bsk.id WHERE ugbsk.id_usergroup=0 AND ubsk.id_user=%s AND NOT(bsk.id_owner=%s) AND NOT(ugbsk.share_level='NO') """ + where_clause params = (uid, uid) return run_sql(query, params) def get_all_items_in_user_public_baskets(uid, format='hb'): """For the specified user, return all the items in the public baskets they are subscribed to, grouped by basket if local or as a list if external.""" query_local = """ SELECT rec.id_bskBASKET, bsk.name, ugbsk.share_level, GROUP_CONCAT(rec.id_bibrec_or_bskEXTREC) FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET AND bsk.id_owner!=%s JOIN user_bskBASKET AS ubsk ON ubsk.id_bskBASKET=rec.id_bskBASKET AND ubsk.id_user=%s JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=rec.id_bskBASKET AND ugbsk.id_usergroup=0 WHERE rec.id_bibrec_or_bskEXTREC > 0 GROUP BY rec.id_bskBASKET""" params_local = (uid, uid) res_local = run_sql(query_local, params_local) query_external = """ SELECT rec.id_bskBASKET, bsk.name, ugbsk.share_level, rec.id_bibrec_or_bskEXTREC, ext.value FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET AND bsk.id_owner!=%s JOIN user_bskBASKET AS ubsk ON ubsk.id_bskBASKET=rec.id_bskBASKET AND ubsk.id_user=%s JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=rec.id_bskBASKET AND ugbsk.id_usergroup=0 JOIN bskEXTFMT AS ext ON ext.id_bskEXTREC=-rec.id_bibrec_or_bskEXTREC AND ext.format=%s WHERE rec.id_bibrec_or_bskEXTREC < 0 ORDER BY rec.id_bskBASKET""" params_external = (uid, uid, format) res_external = run_sql(query_external, params_external) return (res_local, res_external) def get_all_items_in_user_public_baskets_by_matching_notes(uid, p=""): """For the specified user, return all the items in the public baskets they are subscribed to, matching their notes' titles and bodies, grouped by basket""" p = p and '%' + p + '%' or '%' query = """ SELECT notes.id_bskBASKET, bsk.name, ugbsk.share_level, GROUP_CONCAT(DISTINCT(notes.id_bibrec_or_bskEXTREC)) FROM bskRECORDCOMMENT AS notes JOIN bskBASKET AS bsk ON bsk.id=notes.id_bskBASKET AND bsk.id_owner!=%s JOIN user_bskBASKET AS ubsk ON 
ubsk.id_bskBASKET=notes.id_bskBASKET AND ubsk.id_user=%s JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=notes.id_bskBASKET AND ugbsk.id_usergroup=0 AND ugbsk.share_level IS NOT NULL AND ugbsk.share_level!='NO' AND ugbsk.share_level!='RI' WHERE notes.title like %s OR notes.body like %s GROUP BY notes.id_bskBASKET""" params = (uid, uid, p, p) res = run_sql(query, params) return res def get_all_items_in_all_public_baskets(format='hb'): """Return all the items in all the public baskets, grouped by basket if local or as a list if external.""" query_local = """ SELECT rec.id_bskBASKET, bsk.name, ugbsk.share_level, GROUP_CONCAT(rec.id_bibrec_or_bskEXTREC) FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=rec.id_bskBASKET AND ugbsk.id_usergroup=0 WHERE rec.id_bibrec_or_bskEXTREC > 0 GROUP BY rec.id_bskBASKET""" res_local = run_sql(query_local) query_external = """ SELECT rec.id_bskBASKET, bsk.name, ugbsk.share_level, rec.id_bibrec_or_bskEXTREC, ext.value FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=rec.id_bskBASKET AND ugbsk.id_usergroup=0 JOIN bskEXTFMT AS ext ON ext.id_bskEXTREC=-rec.id_bibrec_or_bskEXTREC AND ext.format=%s WHERE rec.id_bibrec_or_bskEXTREC < 0 ORDER BY rec.id_bskBASKET""" params_external = (format,) res_external = run_sql(query_external, params_external) return (res_local, res_external) def get_all_items_in_all_public_baskets_by_matching_notes(p=""): """Return all the items in all the public baskets matching their notes' titles and bodies, grouped by basket""" p = p and '%' + p + '%' or '%' query = """ SELECT notes.id_bskBASKET, bsk.name, ugbsk.share_level, GROUP_CONCAT(DISTINCT(notes.id_bibrec_or_bskEXTREC)) FROM bskRECORDCOMMENT AS notes JOIN bskBASKET AS bsk ON bsk.id=notes.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=notes.id_bskBASKET AND ugbsk.id_usergroup=0 AND ugbsk.share_level IS NOT NULL AND ugbsk.share_level!='NO' AND ugbsk.share_level!='RI' WHERE notes.title like %s OR notes.body like %s GROUP BY notes.id_bskBASKET""" params = (p, p) res = run_sql(query, params) return res ############################ Public access #################################### def get_public_basket_infos(bskid): """return (id, name, date modification, nb of views, id of owner, nickname of owner, rights for public access) for a given basket""" basket = [] query1 = """SELECT bsk.id, bsk.name, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, bsk.id_owner, user.nickname FROM bskBASKET bsk LEFT JOIN user ON bsk.id_owner=user.id WHERE bsk.id=%s""" res1 = run_sql(query1, (int(bskid),)) if len(res1): basket = list(res1[0]) query2 = """SELECT share_level FROM usergroup_bskBASKET WHERE id_usergroup=0 and id_bskBASKET=%s""" res2 = run_sql(query2, (int(bskid),)) if res2: basket.append(res2[0][0]) else: basket.append(None) return basket def get_public_basket_info(bskid): """Return information about a given public basket.""" query = """ SELECT bsk.id, bsk.name, bsk.id_owner, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, COUNT(rec.id_bibrec_or_bskEXTREC), GROUP_CONCAT(rec.id_bibrec_or_bskEXTREC), ubsk.share_level FROM bskBASKET AS bsk LEFT JOIN bskREC AS rec ON rec.id_bskBASKET=bsk.id JOIN usergroup_bskBASKET AS ubsk ON ubsk.id_bskBASKET=bsk.id AND ubsk.id_usergroup=0 WHERE bsk.id=%s GROUP BY bsk.id;""" params = (bskid,) res = run_sql(query, params) return res def 
get_basket_general_infos(bskid): """return information about a basket, suited for public access. @return: a (id, name, date of modification, nb of views, nb of records, id of owner) tuple """ query = """SELECT bsk.id, bsk.name, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, count(rec.id_bibrec_or_bskEXTREC), bsk.id_owner FROM bskBASKET bsk LEFT JOIN bskREC rec ON bsk.id=rec.id_bskBASKET WHERE bsk.id=%s GROUP BY bsk.id""" res = run_sql(query, (int(bskid),)) if res: query2 = "UPDATE bskBASKET SET nb_views=nb_views+1 WHERE id=%s" run_sql(query2, (int(bskid),)) return res[0] return () def get_basket_owner_id(bskid): """Return the uid of the owner.""" query = """SELECT id_owner FROM bskBASKET WHERE id=%s""" res = run_sql(query, (bskid, )) if res: return res[0][0] return -1 def count_public_baskets(): """Returns the number of public baskets.""" query = """ SELECT COUNT(id_bskBASKET) FROM usergroup_bskBASKET WHERE id_usergroup=0""" res = run_sql(query) return __wash_sql_count(res) def get_public_baskets_list(inf_limit, max_number, order=1, asc=1): """Return list of public baskets @param inf_limit: limit to baskets from number x @param max_number: number of baskets to return @order: 1: order by name of basket, 2: number of views, 3: owner @return: [(basket id, basket name, nb of views, uid of owner, nickname of owner)]""" query = """SELECT bsk.id, bsk.name, bsk.nb_views, u.id, u.nickname FROM bskBASKET bsk LEFT JOIN usergroup_bskBASKET ugbsk on bsk.id=ugbsk.id_bskBASKET LEFT JOIN user u on bsk.id_owner=u.id WHERE ugbsk.id_usergroup=0 """ if order == 2: query += 'ORDER BY bsk.nb_views' elif order == 3: query += 'ORDER BY u.nickname' if asc: query += ' ASC' else: query += ' DESC' query += ', u.id' else: query += 'ORDER BY bsk.name' if asc: query += ' ASC ' else: query += ' DESC ' query += "LIMIT %s,%s" return run_sql(query, (inf_limit, max_number)) def count_all_public_baskets(): """Return the number of all the public baskets.""" query = """ SELECT count(id_bskBASKET) FROM usergroup_bskBASKET WHERE id_usergroup=0""" res = run_sql(query) return __wash_sql_count(res) def get_list_public_baskets(page, max_number, sort='name', asc=1): """Return list of public baskets @param page: limit to baskets from number x @param max_number: maximum number of baskets to return @sort: 1: order by name of basket, 2: number of views, 3: owner @return: [(basket id, basket name, nb of views, uid of owner, nickname of owner)]""" query = """ SELECT bsk.id, bsk.name, bsk.id_owner, u.nickname, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), COUNT(rec.id_bibrec_or_bskEXTREC) AS items, bsk.nb_views FROM usergroup_bskBASKET AS ugbsk JOIN bskBASKET AS bsk ON bsk.id=ugbsk.id_bskBASKET LEFT JOIN bskREC AS rec ON rec.id_bskBASKET=bsk.id LEFT JOIN user AS u ON u.id=bsk.id_owner WHERE ugbsk.id_usergroup=0 GROUP BY bsk.id""" if sort == 'name': query += """ ORDER BY bsk.name""" elif sort == 'owner': query += """ ORDER BY u.nickname""" elif sort == 'views': query += """ ORDER BY bsk.nb_views""" elif sort == 'date': query += """ ORDER BY bsk.date_modification""" elif sort == 'items': query += """ ORDER BY items""" else: query += """ ORDER BY bsk.name""" if asc: query += """ ASC""" if sort == """owner""": query += """, u.id""" else: query += """ DESC""" if sort == """owner""": query += """, u.id""" query += """ LIMIT %s, %s""" page = max(0, page) res = run_sql(query, (page, max_number)) return res def is_basket_public(bskid): """Check if the given basket is public. 
Returns ((0,),) if False, ((1,),) if True.""" query = """ SELECT COUNT(*) FROM usergroup_bskBASKET WHERE id_usergroup=0 AND id_bskBASKET=%s""" params = (bskid,) res = run_sql(query, params) return __wash_sql_count(res) def subscribe(uid, bskid): """Subscribe the given user to the given basket.""" query1 = """SELECT COUNT(*) FROM user_bskBASKET WHERE id_user=%s AND id_bskBASKET=%s""" params1 = (uid, bskid) res1 = run_sql(query1, params1) if res1[0][0]: # The user is either the owner of the basket or is already subscribed. return False else: query2 = """INSERT INTO user_bskBASKET (id_user, id_bskBASKET) VALUES (%s, %s)""" params2 = (uid, bskid) run_sql(query2, params2) return True def unsubscribe(uid, bskid): """Unsubscribe the given user from the given basket.""" query1 = """SELECT COUNT(*) FROM bskBASKET WHERE id_owner=%s AND id=%s""" params1 = (uid, bskid) res1 = run_sql(query1, params1) if res1[0][0]: # The user is the owner of the basket. return False else: query2 = """DELETE FROM user_bskBASKET WHERE id_user=%s AND id_bskBASKET=%s""" params2 = (uid, bskid) res2 = run_sql(query2, params2) if res2: return True else: return False def is_user_subscribed_to_basket(uid, bskid): """Return ((1,),) if the user is subscribed to the given basket or ((0,),) if the user is not subscribed or is the owner of the basket.""" query = """ SELECT COUNT(ubsk.id_bskBASKET) FROM user_bskBASKET AS ubsk JOIN bskBASKET AS bsk ON bsk.id=ubsk.id_bskBASKET AND bsk.id_owner!=ubsk.id_user WHERE ubsk.id_user=%s AND ubsk.id_bskBASKET=%s""" params = (uid, bskid) res = run_sql(query, params) return __wash_sql_count(res) def count_subscribers(uid, bskid): """Returns a (number of users, number of groups, number of alerts) tuple for the given user (uid) and basket (bskid).""" uid = int(uid) bskid = int(bskid) query_groups = """ SELECT count(id_usergroup) FROM usergroup_bskBASKET WHERE id_bskBASKET=%s AND NOT(share_level='NO') GROUP BY id_bskBASKET""" params_groups = (bskid,) res_groups = run_sql(query_groups, params_groups) nb_groups = __wash_sql_count(res_groups) query_users = """ SELECT count(id_user) FROM user_bskBASKET WHERE id_bskBASKET=%s AND id_user!=%s GROUP BY id_bskBASKET""" params_users = (bskid, uid) res_users = run_sql(query_users, params_users) nb_users = __wash_sql_count(res_users) query_alerts = """ SELECT count(id_query) FROM user_query_basket WHERE id_basket=%s GROUP BY id_basket""" params_alerts = (bskid,) res_alerts = run_sql(query_alerts, params_alerts) nb_alerts = __wash_sql_count(res_alerts) return (nb_users, nb_groups, nb_alerts) def get_groups_subscribing_to_basket(bskid): """ get list of (group id, group name, rights) tuples for a given basket Please note that group 0 is used to mean everybody. 
""" query = """SELECT ugb.id_usergroup, ug.name, ugb.share_level FROM usergroup_bskBASKET ugb LEFT JOIN usergroup ug ON ugb.id_usergroup=ug.id WHERE ugb.id_bskBASKET=%s ORDER BY ugb.id_usergroup""" return run_sql(query, (int(bskid),)) def get_rights_on_public_basket(bskid): """""" query = """ SELECT share_level FROM usergroup_bskBASKET WHERE id_usergroup=0 AND id_bskBASKET=%s""" params = (bskid,) res = run_sql(query, params) return res def count_public_basket_subscribers(bskid): """Return the number of users subscribed to the given public basket.""" query = """ SELECT COUNT(ubsk.id_user) FROM user_bskBASKET AS ubsk JOIN bskBASKET AS bsk ON bsk.id=ubsk.id_bskBASKET AND bsk.id_owner!=ubsk.id_user WHERE ubsk.id_bskBASKET=%s""" params = (bskid,) res = run_sql(query, params) return __wash_sql_count(res) ################################ Notes ######################################## def get_notes(bskid, recid): """Return all comments for record recid in basket bskid.""" query = """ SELECT user.id, user.nickname, bskcmt.title, bskcmt.body, DATE_FORMAT(bskcmt.date_creation, '%%Y-%%m-%%d %%H:%%i:%%s'), bskcmt.priority, bskcmt.id, bskcmt.in_reply_to_id_bskRECORDCOMMENT FROM bskRECORDCOMMENT bskcmt LEFT JOIN user ON (bskcmt.id_user=user.id) WHERE bskcmt.id_bskBASKET=%s AND bskcmt.id_bibrec_or_bskEXTREC=%s ORDER BY bskcmt.reply_order_cached_data """ bskid = int(bskid) recid = int(recid) res = run_sql(query, (bskid, recid)) if res: return res else: return () def get_note(cmtid): """Return comment cmtid as a (author's nickname, author's uid, title, body, date of creation, priority) tuple""" out = () query = """ SELECT user.nickname, user.id, bskcmt.title, bskcmt.body, DATE_FORMAT(bskcmt.date_creation, '%%Y-%%m-%%d %%H:%%i:%%s'), bskcmt.priority FROM bskRECORDCOMMENT bskcmt LEFT JOIN user ON (bskcmt.id_user=user.id) WHERE bskcmt.id=%s """ cmtid = int(cmtid) res = run_sql(query, (cmtid,)) if res: return res[0] return out def save_note(uid, bskid, recid, title, body, date_creation=None, reply_to=None): """Save then given note (title, body) on the given item in the given basket. 
@param date_creation: date on which the note was created
    @type date_creation: None or String, e.g.: '2011-07-04 14:20:57'
        Note: convert_datestruct_to_datetext((2005, 11, 16, 15, 11, 44, 2, 320, 0))
              -> '2005-11-16 15:11:44'
    """
    if reply_to and CFG_WEBBASKET_MAX_COMMENT_THREAD_DEPTH >= 0:
        # Check that we have not reached max depth
        note_ancestors = get_note_ancestors(reply_to)
        if len(note_ancestors) >= CFG_WEBBASKET_MAX_COMMENT_THREAD_DEPTH:
            if CFG_WEBBASKET_MAX_COMMENT_THREAD_DEPTH == 0:
                reply_to = None
            else:
                reply_to = note_ancestors[CFG_WEBBASKET_MAX_COMMENT_THREAD_DEPTH - 1]

    if not date_creation:
        date = convert_datestruct_to_datetext(localtime())
    else:
        # the date comes with the proper format
        date = date_creation

    res = run_sql("""INSERT INTO bskRECORDCOMMENT (id_user, id_bskBASKET,
                       id_bibrec_or_bskEXTREC, title, body, date_creation,
                       in_reply_to_id_bskRECORDCOMMENT)
                     VALUES (%s, %s, %s, %s, %s, %s, %s)""",
                  (int(uid), int(bskid), int(recid), title, body, date, reply_to or 0))
    if res:
        new_comid = int(res)
        parent_reply_order = run_sql("""SELECT reply_order_cached_data from bskRECORDCOMMENT where id=%s""", (reply_to,))
        if not parent_reply_order or parent_reply_order[0][0] is None:
            parent_reply_order = ''
        else:
            parent_reply_order = parent_reply_order[0][0]
-        run_sql("""UPDATE bskRECORDCOMMENT SET reply_order_cached_data=%s WHERE id=%s""",
+        run_sql("""UPDATE bskRECORDCOMMENT SET reply_order_cached_data=_binary %s WHERE id=%s""",
                (parent_reply_order + get_reply_order_cache_data(new_comid), new_comid))
        return int(res)
    return 0

def delete_note(bskid, recid, cmtid):
    """Delete a comment on an item of a basket"""
    query = """ DELETE FROM bskRECORDCOMMENT
                WHERE id_bskBASKET=%s
                AND id_bibrec_or_bskEXTREC=%s
                AND id=%s"""
    params = (int(bskid), int(recid), int(cmtid))
    run_sql(query, params)

def get_note_ancestors(cmtid, depth=None):
    """
    Returns the list of ancestors of the given note, ordered from oldest to
    newest ("top-down": the direct parent of cmtid is at the last position),
    up to the given depth.

    @param cmtid: the ID of the note for which we want to retrieve ancestors
    @type cmtid: int
    @param depth: the maximum number of levels up from the given note for
                  which we want to retrieve ancestors. None for no limit,
                  1 for direct parent only, etc.
    @type depth: int
    @return: the list of ancestors
    @rtype: list
    """
    if depth == 0:
        return []

    res = run_sql("SELECT in_reply_to_id_bskRECORDCOMMENT FROM bskRECORDCOMMENT WHERE id=%s", (cmtid,))
    if res:
        parent_cmtid = res[0][0]
        if parent_cmtid == 0:
            return []
        parent_ancestors = []
        if depth:
            depth -= 1
        parent_ancestors = get_note_ancestors(parent_cmtid, depth)
        parent_ancestors.append(parent_cmtid)
        return parent_ancestors
    else:
        return []

def note_belongs_to_item_in_basket_p(cmtid, recid, bskid):
    """Returns 1 (True) if the given note (cmtid) belongs to the given item
    (recid) and the given basket (bskid), or 0 (False)."""
    query = """ SELECT COUNT(*)
                FROM bskRECORDCOMMENT
                WHERE id=%s
                AND id_bibrec_or_bskEXTREC=%s
                AND id_bskBASKET=%s"""
    params = (cmtid, recid, bskid)
    res = run_sql(query, params)
    return __wash_sql_count(res)

def get_number_of_notes_per_record_in_basket(bskid, recids):
    """Returns the number of comments per record for all the given records
    in the given basket"""
    # We need to convert the list of recids into a string of comma separated
    # numbers (recids), instead of a tuple, to cover the case where we have
    # single element lists of recids. Example:
    # [1] --> '1' instead of [1] --> (1,)
    # Single element tuples would cause the query to fail due to the syntax.
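    # Note on the `_binary` change in save_note() above (illustrative comment,
    # not part of the patch): the reply_order_cached_data column holds the raw
    # 4-byte value produced by get_reply_order_cache_data(), e.g. for id 200:
    #
    #     packed = "%s%s%s%s" % (chr(0), chr(0), chr(0), chr(200))   # '\x00\x00\x00\xc8'
    #
    # '\xc8' is not valid UTF-8, so on MySQL setups that interpret bound string
    # parameters in the connection character set the bare %s placeholder can be
    # rejected or silently mangled; the `_binary %s` introducer tells MySQL to
    # treat the bound value as raw bytes instead. The same change is applied to
    # cmtRECORDCOMMENT in webcomment.py further down in this patch.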
query = """ SELECT rec.id_bibrec_or_bskEXTREC, COUNT(cmt.id_bibrec_or_bskEXTREC) FROM bskREC as rec LEFT JOIN bskRECORDCOMMENT as cmt ON cmt.id_bibrec_or_bskEXTREC = rec.id_bibrec_or_bskEXTREC WHERE rec.id_bskBASKET=%%s AND rec.id_bibrec_or_bskEXTREC IN (%s) GROUP BY id_bibrec_or_bskEXTREC ORDER BY rec.score""" % (str(map(int, recids))[1:-1],) params = (bskid,) result = run_sql(query, params) return result ########################## Usergroup functions ################################ def get_group_infos(uid): """For each group the user with uid is a member of return the id, name and number of baskets.""" query = """SELECT g.id, g.name, count(ugb.id_bskBASKET) FROM usergroup g LEFT JOIN (user_usergroup ug, usergroup_bskBASKET ugb) ON (g.id=ug.id_usergroup AND g.id=ugb.id_usergroup) WHERE ug.id_user=%s AND NOT(ugb.share_level='NO') AND ug.user_status!=%s GROUP BY g.id ORDER BY g.name""" params = (int(uid), CFG_WEBSESSION_USERGROUP_STATUS['PENDING']) res = run_sql(query, params) return res def count_groups_user_member_of(uid): """Returns the number of groups the user has joined.""" query = """ SELECT COUNT(id_usergroup) FROM user_usergroup WHERE id_user=%s AND user_status!=%s""" params = (int(uid), CFG_WEBSESSION_USERGROUP_STATUS['PENDING']) res = run_sql(query, params) return __wash_sql_count(res) def get_groups_user_member_of(uid): """ Get uids and names of groups user is member of. @param uid: user id (int) @return: a tuple of (group_id, group_name) tuples """ query = """ SELECT g.id, g.name FROM usergroup g JOIN user_usergroup ug ON (g.id=ug.id_usergroup) WHERE ug.id_user=%s and ug.user_status!=%s ORDER BY g.name """ params = (int(uid), CFG_WEBSESSION_USERGROUP_STATUS['PENDING']) res = run_sql(query, params) if res: return res return () ########################## auxilliary functions ############################### def __wash_sql_count(res): """Wash the result of SQL COUNT function and return only an integer.""" if res: return res[0][0] return 0 def __decompress_last(item): """private function, used to shorten code""" item = list(item) item[-1] = decompress(item[-1]) return item def create_pseudo_record(es_title, es_desc, es_url, of="hb"): """Return a pseudo record representation given a title and a description.""" if of == 'hb': record = '\n'.join([es_title, es_desc, es_url]) if of == 'xm': # In case we want to use the controlfield, # the -es_id must be used. #%s record = """ %s %s %s """ % (encode_for_xml(es_title), encode_for_xml(es_desc), encode_for_xml(es_url)) return record def prettify_url(url, char_limit=50, nb_dots=3): """If the url has more characters than char_limit return a shortened version of it keeping the beginning and ending and replacing the rest with dots.""" if len(url) > char_limit: # let's set a minimum character limit if char_limit < 5: char_limit = 5 # let's set a maximum number of dots in relation to the character limit if nb_dots > char_limit/4: nb_dots = char_limit/5 nb_char_url = char_limit - nb_dots nb_char_end = nb_char_url/4 nb_char_beg = nb_char_url - nb_char_end return url[:nb_char_beg] + '.'*nb_dots + url[-nb_char_end:] else: return url diff --git a/modules/webcomment/lib/webcomment.py b/modules/webcomment/lib/webcomment.py index b3f1fabbd..374f186c6 100644 --- a/modules/webcomment/lib/webcomment.py +++ b/modules/webcomment/lib/webcomment.py @@ -1,2039 +1,2039 @@ # -*- coding: utf-8 -*- # This file is part of Invenio. -# Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 CERN. 
+# Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Comments and reviews for records """ __revision__ = "$Id$" # non Invenio imports: import time import math import os import shutil import cgi import re from datetime import datetime, timedelta # Invenio imports: from invenio.dbquery import run_sql from invenio.config import CFG_PREFIX, \ CFG_SITE_LANG, \ CFG_WEBALERT_ALERT_ENGINE_EMAIL,\ CFG_SITE_SUPPORT_EMAIL,\ CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL,\ CFG_SITE_URL,\ CFG_SITE_NAME,\ CFG_WEBCOMMENT_ALLOW_REVIEWS,\ CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS,\ CFG_WEBCOMMENT_ALLOW_COMMENTS,\ CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL,\ CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN,\ CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS,\ CFG_WEBCOMMENT_DEFAULT_MODERATOR, \ CFG_SITE_RECORD, \ CFG_WEBCOMMENT_EMAIL_REPLIES_TO, \ CFG_WEBCOMMENT_ROUND_DATAFIELD, \ CFG_WEBCOMMENT_RESTRICTION_DATAFIELD, \ CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH from invenio.webmessage_mailutils import \ email_quote_txt, \ email_quoted_txt2html from invenio.htmlutils import tidy_html from invenio.webuser import get_user_info, get_email, collect_user_info from invenio.dateutils import convert_datetext_to_dategui, \ datetext_default, \ convert_datestruct_to_datetext from invenio.mailutils import send_email from invenio.errorlib import register_exception from invenio.messages import wash_language, gettext_set_language from invenio.urlutils import wash_url_argument from invenio.webcomment_config import CFG_WEBCOMMENT_ACTION_CODE from invenio.access_control_engine import acc_authorize_action from invenio.search_engine import \ guess_primary_collection_of_a_record, \ check_user_can_view_record, \ get_collection_reclist, \ get_colID from invenio.search_engine_utils import get_fieldvalues from invenio.webcomment_washer import EmailWasher try: import invenio.template webcomment_templates = invenio.template.load('webcomment') except: pass def perform_request_display_comments_or_remarks(req, recID, display_order='od', display_since='all', nb_per_page=100, page=1, ln=CFG_SITE_LANG, voted=-1, reported=-1, subscribed=0, reviews=0, uid=-1, can_send_comments=False, can_attach_files=False, user_is_subscribed_to_discussion=False, user_can_unsubscribe_from_discussion=False, display_comment_rounds=None): """ Returns all the comments (reviews) of a specific internal record or external basket record. 
@param recID: record id where (internal record IDs > 0) or (external basket record IDs < -100) @param display_order: hh = highest helpful score, review only lh = lowest helpful score, review only hs = highest star score, review only ls = lowest star score, review only od = oldest date nd = newest date @param display_since: all= no filtering by date nd = n days ago nw = n weeks ago nm = n months ago ny = n years ago where n is a single digit integer between 0 and 9 @param nb_per_page: number of results per page @param page: results page @param voted: boolean, active if user voted for a review, see perform_request_vote function @param reported: boolean, active if user reported a certain comment/review, perform_request_report function @param subscribed: int, 1 if user just subscribed to discussion, -1 if unsubscribed @param reviews: boolean, enabled if reviews, disabled for comments @param uid: the id of the user who is reading comments @param can_send_comments: if user can send comment or not @param can_attach_files: if user can attach file to comment or not @param user_is_subscribed_to_discussion: True if user already receives new comments by email @param user_can_unsubscribe_from_discussion: True is user is allowed to unsubscribe from discussion @return html body. """ _ = gettext_set_language(ln) warnings = [] nb_reviews = 0 nb_comments = 0 # wash arguments recID = wash_url_argument(recID, 'int') ln = wash_language(ln) display_order = wash_url_argument(display_order, 'str') display_since = wash_url_argument(display_since, 'str') nb_per_page = wash_url_argument(nb_per_page, 'int') page = wash_url_argument(page, 'int') voted = wash_url_argument(voted, 'int') reported = wash_url_argument(reported, 'int') reviews = wash_url_argument(reviews, 'int') # vital argument check (valid, error_body) = check_recID_is_in_range(recID, warnings, ln) if not(valid): return error_body # CERN hack begins: filter out ATLAS comments from invenio.config import CFG_CERN_SITE if CFG_CERN_SITE: restricted_comments_p = False for report_number in get_fieldvalues(recID, '088__a'): if report_number.startswith("ATL-"): restricted_comments_p = True break if restricted_comments_p: err_code, err_msg = acc_authorize_action(uid, 'viewrestrcoll', collection='ATLAS Communications') if err_code: return err_msg # CERN hack ends # Query the database and filter results user_info = collect_user_info(uid) res = query_retrieve_comments_or_remarks(recID, display_order, display_since, reviews, user_info=user_info) # res2 = query_retrieve_comments_or_remarks(recID, display_order, display_since, not reviews, user_info=user_info) nb_res = len(res) from invenio.webcommentadminlib import get_nb_reviews, get_nb_comments nb_reviews = get_nb_reviews(recID, count_deleted=False) nb_comments = get_nb_comments(recID, count_deleted=False) # checking non vital arguemnts - will be set to default if wrong #if page <= 0 or page.lower() != 'all': if page < 0: page = 1 if nb_per_page < 0: nb_per_page = 100 if CFG_WEBCOMMENT_ALLOW_REVIEWS and reviews: if display_order not in ['od', 'nd', 'hh', 'lh', 'hs', 'ls']: display_order = 'hh' else: if display_order not in ['od', 'nd']: display_order = 'od' if not display_comment_rounds: display_comment_rounds = [] # filter results according to page and number of reults per page if nb_per_page > 0: if nb_res > 0: last_page = int(math.ceil(nb_res / float(nb_per_page))) else: last_page = 1 if page > last_page: page = 1 if nb_res > nb_per_page: # if more than one page of results if page < last_page: res = 
res[(page-1)*(nb_per_page) : (page*nb_per_page)] else: res = res[(page-1)*(nb_per_page) : ] else: # one page of results pass else: last_page = 1 # Add information regarding visibility of comment for user user_collapsed_comments = get_user_collapsed_comments_for_record(uid, recID) if reviews: res = [row[:] + (row[10] in user_collapsed_comments,) for row in res] else: res = [row[:] + (row[6] in user_collapsed_comments,) for row in res] # Send to template avg_score = 0.0 # comments not allowed by admin if not CFG_WEBCOMMENT_ALLOW_COMMENTS and not CFG_WEBCOMMENT_ALLOW_REVIEWS: body = webcomment_templates.tmpl_error( _('Comments on records have been disallowed by the' ' administrator.'), ln) return body if reported > 0: warnings.append((_('Your feedback has been recorded, many thanks.'), 'green')) elif reported == 0: warnings.append((_('You have already reported an abuse for this' ' comment.'), '')) elif reported == -2: warnings.append((_('The comment you have reported no longer ' 'exists.'), '')) if CFG_WEBCOMMENT_ALLOW_REVIEWS and reviews: avg_score = calculate_avg_score(res) if voted > 0: warnings.append((_('Your feedback has been recorded, many' ' thanks.'), 'green')) elif voted == 0: warnings.append((_('Sorry, you have already voted. This vote has ' 'not been recorded.'), '')) if subscribed == 1: warnings.append( (_('You have been subscribed to this discussion. From now on, you' ' will receive an email whenever a new comment is posted.'), 'green') ) elif subscribed == -1: warnings.append((_('You have been unsubscribed from this discussion.'), 'green')) grouped_comments = group_comments_by_round(res, reviews) # Clean list of comments round names if not display_comment_rounds: display_comment_rounds = [] elif 'all' in display_comment_rounds: display_comment_rounds = [cmtgrp[0] for cmtgrp in grouped_comments] elif 'latest' in display_comment_rounds: if grouped_comments: display_comment_rounds.append(grouped_comments[-1][0]) display_comment_rounds.remove('latest') body = webcomment_templates.tmpl_get_comments(req, recID, ln, nb_per_page, page, last_page, display_order, display_since, CFG_WEBCOMMENT_ALLOW_REVIEWS, grouped_comments, nb_comments, avg_score, warnings, border=0, reviews=reviews, total_nb_reviews=nb_reviews, uid=uid, can_send_comments=can_send_comments, can_attach_files=can_attach_files, user_is_subscribed_to_discussion=\ user_is_subscribed_to_discussion, user_can_unsubscribe_from_discussion=\ user_can_unsubscribe_from_discussion, display_comment_rounds=display_comment_rounds) return body def perform_request_vote(cmt_id, client_ip_address, value, uid=-1): """ Vote positively or negatively for a comment/review @param cmt_id: review id @param value: +1 for voting positively -1 for voting negatively @return: integer 1 if successful, integer 0 if not """ cmt_id = wash_url_argument(cmt_id, 'int') client_ip_address = wash_url_argument(client_ip_address, 'str') value = wash_url_argument(value, 'int') uid = wash_url_argument(uid, 'int') if cmt_id > 0 and value in [-1, 1] and check_user_can_vote(cmt_id, client_ip_address, uid): action_date = convert_datestruct_to_datetext(time.localtime()) action_code = CFG_WEBCOMMENT_ACTION_CODE['VOTE'] query = """INSERT INTO cmtACTIONHISTORY (id_cmtRECORDCOMMENT, id_bibrec, id_user, client_host, action_time, action_code) VALUES (%s, NULL ,%s, inet_aton(%s), %s, %s)""" params = (cmt_id, uid, client_ip_address, action_date, action_code) run_sql(query, params) return query_record_useful_review(cmt_id, value) else: return 0 def check_user_can_comment(recID, 
client_ip_address, uid=-1): """ Check if a user hasn't already commented within the last seconds time limit: CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS @param recID: record id @param client_ip_address: IP => use: str(req.remote_ip) @param uid: user id, as given by invenio.webuser.getUid(req) """ recID = wash_url_argument(recID, 'int') client_ip_address = wash_url_argument(client_ip_address, 'str') uid = wash_url_argument(uid, 'int') max_action_time = time.time() - CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS max_action_time = convert_datestruct_to_datetext(time.localtime(max_action_time)) action_code = CFG_WEBCOMMENT_ACTION_CODE['ADD_COMMENT'] query = """SELECT id_bibrec FROM cmtACTIONHISTORY WHERE id_bibrec=%s AND action_code=%s AND action_time>%s """ params = (recID, action_code, max_action_time) if uid < 0: query += " AND client_host=inet_aton(%s)" params += (client_ip_address,) else: query += " AND id_user=%s" params += (uid,) res = run_sql(query, params) return len(res) == 0 def check_user_can_review(recID, client_ip_address, uid=-1): """ Check if a user hasn't already reviewed within the last seconds time limit: CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_REVIEWS_IN_SECONDS @param recID: record ID @param client_ip_address: IP => use: str(req.remote_ip) @param uid: user id, as given by invenio.webuser.getUid(req) """ action_code = CFG_WEBCOMMENT_ACTION_CODE['ADD_REVIEW'] query = """SELECT id_bibrec FROM cmtACTIONHISTORY WHERE id_bibrec=%s AND action_code=%s """ params = (recID, action_code) if uid < 0: query += " AND client_host=inet_aton(%s)" params += (client_ip_address,) else: query += " AND id_user=%s" params += (uid,) res = run_sql(query, params) return len(res) == 0 def check_user_can_vote(cmt_id, client_ip_address, uid=-1): """ Checks if a user hasn't already voted @param cmt_id: comment id @param client_ip_address: IP => use: str(req.remote_ip) @param uid: user id, as given by invenio.webuser.getUid(req) """ cmt_id = wash_url_argument(cmt_id, 'int') client_ip_address = wash_url_argument(client_ip_address, 'str') uid = wash_url_argument(uid, 'int') query = """SELECT id_cmtRECORDCOMMENT FROM cmtACTIONHISTORY WHERE id_cmtRECORDCOMMENT=%s""" params = (cmt_id,) if uid < 0: query += " AND client_host=inet_aton(%s)" params += (client_ip_address,) else: query += " AND id_user=%s" params += (uid, ) res = run_sql(query, params) return (len(res) == 0) def get_comment_collection(cmt_id): """ Extract the collection where the comment is written """ query = "SELECT id_bibrec FROM cmtRECORDCOMMENT WHERE id=%s" recid = run_sql(query, (cmt_id,)) record_primary_collection = guess_primary_collection_of_a_record(recid[0][0]) return record_primary_collection def get_collection_moderators(collection): """ Return the list of comment moderators for the given collection. """ from invenio.access_control_engine import acc_get_authorized_emails res = list(acc_get_authorized_emails('moderatecomments', collection=collection)) if not res: return [CFG_WEBCOMMENT_DEFAULT_MODERATOR,] return res def perform_request_report(cmt_id, client_ip_address, uid=-1): """ Report a comment/review for inappropriate content. Will send an email to the administrator if number of reports is a multiple of CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN @param cmt_id: comment id @return: integer 1 if successful, integer 0 if not. 
-2 if comment does not exist """ cmt_id = wash_url_argument(cmt_id, 'int') if cmt_id <= 0: return 0 (query_res, nb_abuse_reports) = query_record_report_this(cmt_id) if query_res == 0: return 0 elif query_res == -2: return -2 if not(check_user_can_report(cmt_id, client_ip_address, uid)): return 0 action_date = convert_datestruct_to_datetext(time.localtime()) action_code = CFG_WEBCOMMENT_ACTION_CODE['REPORT_ABUSE'] query = """INSERT INTO cmtACTIONHISTORY (id_cmtRECORDCOMMENT, id_bibrec, id_user, client_host, action_time, action_code) VALUES (%s, NULL, %s, inet_aton(%s), %s, %s)""" params = (cmt_id, uid, client_ip_address, action_date, action_code) run_sql(query, params) if nb_abuse_reports % CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN == 0: (cmt_id2, id_bibrec, id_user, cmt_body, cmt_date, cmt_star, cmt_vote, cmt_nb_votes_total, cmt_title, cmt_reported, round_name, restriction) = query_get_comment(cmt_id) (user_nb_abuse_reports, user_votes, user_nb_votes_total) = query_get_user_reports_and_votes(int(id_user)) (nickname, user_email, last_login) = query_get_user_contact_info(id_user) from_addr = '%s Alert Engine <%s>' % (CFG_SITE_NAME, CFG_WEBALERT_ALERT_ENGINE_EMAIL) comment_collection = get_comment_collection(cmt_id) to_addrs = get_collection_moderators(comment_collection) subject = "A comment has been reported as inappropriate by a user" body = ''' The following comment has been reported a total of %(cmt_reported)s times. Author: nickname = %(nickname)s email = %(user_email)s user_id = %(uid)s This user has: total number of reports = %(user_nb_abuse_reports)s %(votes)s Comment: comment_id = %(cmt_id)s record_id = %(id_bibrec)s date written = %(cmt_date)s nb reports = %(cmt_reported)s %(review_stuff)s body = ---start body--- %(cmt_body)s ---end body--- Please go to the record page %(comment_admin_link)s to delete this message if necessary. A warning will be sent to the user in question.''' % \ { 'cfg-report_max' : CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN, 'nickname' : nickname, 'user_email' : user_email, 'uid' : id_user, 'user_nb_abuse_reports' : user_nb_abuse_reports, 'user_votes' : user_votes, 'votes' : CFG_WEBCOMMENT_ALLOW_REVIEWS and \ "total number of positive votes\t= %s\n\t\ttotal number of negative votes\t= %s" % \ (user_votes, (user_nb_votes_total - user_votes)) or "\n", 'cmt_id' : cmt_id, 'id_bibrec' : id_bibrec, 'cmt_date' : cmt_date, 'cmt_reported' : cmt_reported, 'review_stuff' : CFG_WEBCOMMENT_ALLOW_REVIEWS and \ "star score\t= %s\n\treview title\t= %s" % (cmt_star, cmt_title) or "", 'cmt_body' : cmt_body, 'comment_admin_link' : CFG_SITE_URL + "/"+ CFG_SITE_RECORD +"/" + str(id_bibrec) + '/comments#' + str(cmt_id), 'user_admin_link' : "user_admin_link" #! FIXME } #FIXME to be added to email when websession module is over: #If you wish to ban the user, you can do so via the User Admin Panel %(user_admin_link)s. 
send_email(from_addr, to_addrs, subject, body) return 1 def check_user_can_report(cmt_id, client_ip_address, uid=-1): """ Checks if a user hasn't already reported a comment @param cmt_id: comment id @param client_ip_address: IP => use: str(req.remote_ip) @param uid: user id, as given by invenio.webuser.getUid(req) """ cmt_id = wash_url_argument(cmt_id, 'int') client_ip_address = wash_url_argument(client_ip_address, 'str') uid = wash_url_argument(uid, 'int') query = """SELECT id_cmtRECORDCOMMENT FROM cmtACTIONHISTORY WHERE id_cmtRECORDCOMMENT=%s""" params = (uid,) if uid < 0: query += " AND client_host=inet_aton(%s)" params += (client_ip_address,) else: query += " AND id_user=%s" params += (uid,) res = run_sql(query, params) return (len(res) == 0) def query_get_user_contact_info(uid): """ Get the user contact information @return: tuple (nickname, email, last_login), if none found return () Note: for the moment, if no nickname, will return email address up to the '@' """ query1 = """SELECT nickname, email, DATE_FORMAT(last_login, '%%Y-%%m-%%d %%H:%%i:%%s') FROM user WHERE id=%s""" params1 = (uid,) res1 = run_sql(query1, params1) if res1: return res1[0] else: return () def query_get_user_reports_and_votes(uid): """ Retrieve total number of reports and votes of a particular user @param uid: user id @return: tuple (total_nb_reports, total_nb_votes_yes, total_nb_votes_total) if none found return () """ query1 = """SELECT nb_votes_yes, nb_votes_total, nb_abuse_reports FROM cmtRECORDCOMMENT WHERE id_user=%s""" params1 = (uid,) res1 = run_sql(query1, params1) if len(res1) == 0: return () nb_votes_yes = nb_votes_total = nb_abuse_reports = 0 for cmt_tuple in res1: nb_votes_yes += int(cmt_tuple[0]) nb_votes_total += int(cmt_tuple[1]) nb_abuse_reports += int(cmt_tuple[2]) return (nb_abuse_reports, nb_votes_yes, nb_votes_total) def query_get_comment(comID): """ Get all fields of a comment @param comID: comment id @return: tuple (comID, id_bibrec, id_user, body, date_creation, star_score, nb_votes_yes, nb_votes_total, title, nb_abuse_reports, round_name, restriction) if none found return () """ query1 = """SELECT id, id_bibrec, id_user, body, DATE_FORMAT(date_creation, '%%Y-%%m-%%d %%H:%%i:%%s'), star_score, nb_votes_yes, nb_votes_total, title, nb_abuse_reports, round_name, restriction FROM cmtRECORDCOMMENT WHERE id=%s""" params1 = (comID,) res1 = run_sql(query1, params1) if len(res1)>0: return res1[0] else: return () def query_record_report_this(comID): """ Increment the number of reports for a comment @param comID: comment id @return: tuple (success, new_total_nb_reports_for_this_comment) where success is integer 1 if success, integer 0 if not, -2 if comment does not exist """ #retrieve nb_abuse_reports query1 = "SELECT nb_abuse_reports FROM cmtRECORDCOMMENT WHERE id=%s" params1 = (comID,) res1 = run_sql(query1, params1) if len(res1) == 0: return (-2, 0) #increment and update nb_abuse_reports = int(res1[0][0]) + 1 query2 = "UPDATE cmtRECORDCOMMENT SET nb_abuse_reports=%s WHERE id=%s" params2 = (nb_abuse_reports, comID) res2 = run_sql(query2, params2) return (int(res2), nb_abuse_reports) def query_record_useful_review(comID, value): """ private funciton Adjust the number of useful votes and number of total votes for a comment. 
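perform_request_report() above mails the collection moderators only when the updated abuse count is an exact multiple of CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN. With an illustrative threshold of 5 (a stand-in, not the shipped default), the notification goes out on the 5th, 10th, 15th... report and stays silent in between:

    # Illustrative only; 5 stands in for the configured threshold.
    threshold = 5
    for nb_abuse_reports in (1, 4, 5, 9, 10):
        moderators_notified = (nb_abuse_reports % threshold == 0)
        # -> False, False, True, False, True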
@param comID: comment id @param value: +1 or -1 @return: integer 1 if successful, integer 0 if not """ # retrieve nb_useful votes query1 = "SELECT nb_votes_total, nb_votes_yes FROM cmtRECORDCOMMENT WHERE id=%s" params1 = (comID,) res1 = run_sql(query1, params1) if len(res1)==0: return 0 # modify and insert new nb_useful votes nb_votes_yes = int(res1[0][1]) if value >= 1: nb_votes_yes = int(res1[0][1]) + 1 nb_votes_total = int(res1[0][0]) + 1 query2 = "UPDATE cmtRECORDCOMMENT SET nb_votes_total=%s, nb_votes_yes=%s WHERE id=%s" params2 = (nb_votes_total, nb_votes_yes, comID) res2 = run_sql(query2, params2) return int(res2) def query_retrieve_comments_or_remarks(recID, display_order='od', display_since='0000-00-00 00:00:00', ranking=0, limit='all', user_info=None): """ Private function Retrieve tuple of comments or remarks from the database @param recID: record id @param display_order: hh = highest helpful score lh = lowest helpful score hs = highest star score ls = lowest star score od = oldest date nd = newest date @param display_since: datetime, e.g. 0000-00-00 00:00:00 @param ranking: boolean, enabled if reviews, disabled for comments @param limit: number of comments/review to return @return: tuple of comment where comment is tuple (nickname, uid, date_creation, body, status, id) if ranking disabled or tuple (nickname, uid, date_creation, body, status, nb_votes_yes, nb_votes_total, star_score, title, id) Note: for the moment, if no nickname, will return email address up to '@' """ display_since = calculate_start_date(display_since) order_dict = { 'hh' : "cmt.nb_votes_yes/(cmt.nb_votes_total+1) DESC, cmt.date_creation DESC ", 'lh' : "cmt.nb_votes_yes/(cmt.nb_votes_total+1) ASC, cmt.date_creation ASC ", 'ls' : "cmt.star_score ASC, cmt.date_creation DESC ", 'hs' : "cmt.star_score DESC, cmt.date_creation DESC ", 'nd' : "cmt.reply_order_cached_data DESC ", 'od' : "cmt.reply_order_cached_data ASC " } # Ranking only done for comments and when allowed if ranking and recID > 0: try: display_order = order_dict[display_order] except: display_order = order_dict['od'] else: # in case of recID > 0 => external record => no ranking! 
ranking = 0 try: if display_order[-1] == 'd': display_order = order_dict[display_order] else: display_order = order_dict['od'] except: display_order = order_dict['od'] #display_order = order_dict['nd'] query = """SELECT user.nickname, cmt.id_user, DATE_FORMAT(cmt.date_creation, '%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s'), cmt.body, cmt.status, cmt.nb_abuse_reports, %(ranking)s cmt.id, cmt.round_name, cmt.restriction, %(reply_to_column)s FROM cmtRECORDCOMMENT cmt LEFT JOIN user ON user.id=cmt.id_user WHERE cmt.id_bibrec=%%s %(ranking_only)s %(display_since)s ORDER BY %(display_order)s """ % {'ranking' : ranking and ' cmt.nb_votes_yes, cmt.nb_votes_total, cmt.star_score, cmt.title, ' or '', 'ranking_only' : ranking and ' AND cmt.star_score>0 ' or ' AND cmt.star_score=0 ', # 'id_bibrec' : recID > 0 and 'cmt.id_bibrec' or 'cmt.id_bibrec_or_bskEXTREC', # 'table' : recID > 0 and 'cmtRECORDCOMMENT' or 'bskRECORDCOMMENT', 'display_since' : display_since == '0000-00-00 00:00:00' and ' ' or 'AND cmt.date_creation>=\'%s\' ' % display_since, 'display_order': display_order, 'reply_to_column': recID > 0 and 'cmt.in_reply_to_id_cmtRECORDCOMMENT' or 'cmt.in_reply_to_id_bskRECORDCOMMENT'} params = (recID,) res = run_sql(query, params) # return res new_limit = limit comments_list = [] for row in res: if ranking: # when dealing with reviews, row[12] holds restriction info: restriction = row[12] else: # when dealing with comments, row[8] holds restriction info: restriction = row[8] if user_info and check_user_can_view_comment(user_info, None, restriction)[0] != 0: # User cannot view comment. Look further continue comments_list.append(row) if limit.isdigit(): new_limit -= 1 if limit < 1: break if comments_list: if limit.isdigit(): return comments_list[:limit] else: return comments_list return () # def get_comment_children(comID): # """ # Returns the list of children (i.e. direct descendants) ordered by time of addition. # @param comID: the ID of the comment for which we want to retrieve children # @type comID: int # @return the list of children # @rtype: list # """ # res = run_sql("SELECT id FROM cmtRECORDCOMMENT WHERE in_reply_to_id_cmtRECORDCOMMENT=%s", (comID,)) # return [row[0] for row in res] # def get_comment_descendants(comID, depth=None): # """ # Returns the list of descendants of the given comment, orderd from # oldest to newest ("top-down"), down to depth specified as parameter. # @param comID: the ID of the comment for which we want to retrieve descendant # @type comID: int # @param depth: the max depth down to which we want to retrieve # descendants. Specify None for no limit, 1 for direct # children only, etc. 
# @return the list of ancestors # @rtype: list(tuple(comment ID, descendants comments IDs)) # """ # if depth == 0: # return (comID, []) # res = run_sql("SELECT id FROM cmtRECORDCOMMENT WHERE in_reply_to_id_cmtRECORDCOMMENT=%s", (comID,)) # if res: # children_comID = [row[0] for row in res] # children_descendants = [] # if depth: # depth -= 1 # children_descendants = [get_comment_descendants(child_comID, depth) for child_comID in children_comID] # return (comID, children_descendants) # else: # return (comID, []) def get_comment_ancestors(comID, depth=None): """ Returns the list of ancestors of the given comment, ordered from oldest to newest ("top-down": direct parent of comID is at last position), up to given depth @param comID: the ID of the comment for which we want to retrieve ancestors @type comID: int @param depth: the maximum of levels up from the given comment we want to retrieve ancestors. None for no limit, 1 for direct parent only, etc. @type depth: int @return the list of ancestors @rtype: list """ if depth == 0: return [] res = run_sql("SELECT in_reply_to_id_cmtRECORDCOMMENT FROM cmtRECORDCOMMENT WHERE id=%s", (comID,)) if res: parent_comID = res[0][0] if parent_comID == 0: return [] parent_ancestors = [] if depth: depth -= 1 parent_ancestors = get_comment_ancestors(parent_comID, depth) parent_ancestors.append(parent_comID) return parent_ancestors else: return [] def get_reply_order_cache_data(comid): """ Prepare a representation of the comment ID given as parameter so that it is suitable for byte ordering in MySQL. """ return "%s%s%s%s" % (chr((comid >> 24) % 256), chr((comid >> 16) % 256), chr((comid >> 8) % 256), chr(comid % 256)) def query_add_comment_or_remark(reviews=0, recID=0, uid=-1, msg="", note="", score=0, priority=0, client_ip_address='', editor_type='textarea', req=None, reply_to=None, attached_files=None): """ Private function Insert a comment/review or remarkinto the database @param recID: record id @param uid: user id @param msg: comment body @param note: comment title @param score: review star score @param priority: remark priority #!FIXME @param editor_type: the kind of editor used to submit the comment: 'textarea', 'ckeditor' @param req: request object. If provided, email notification are sent after we reply to user request. @param reply_to: the id of the comment we are replying to with this inserted comment. 
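get_reply_order_cache_data() above is what makes the single ORDER BY cmt.reply_order_cached_data in query_retrieve_comments_or_remarks() return whole threads depth-first: each comment stores its parent's cached path with its own id packed as four big-endian bytes appended, so plain byte-wise sorting places every reply right after its parent's subtree and keeps siblings in insertion (id) order. A small standalone demonstration with toy ids, not part of the module:

    # Toy model of the cached reply-order path.
    def pack(comid):
        return "%s%s%s%s" % (chr((comid >> 24) % 256), chr((comid >> 16) % 256),
                             chr((comid >> 8) % 256), chr(comid % 256))

    # Comment 1 and 3 are top level; comment 5 replies to comment 1.
    paths = {1: pack(1), 3: pack(3)}
    paths[5] = paths[1] + pack(5)           # parent path + own packed id
    order = sorted(paths, key=paths.get)    # -> [1, 5, 3]: the reply follows 1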
@return: integer >0 representing id if successful, integer 0 if not """ current_date = calculate_start_date('0d') #change utf-8 message into general unicode msg = msg.decode('utf-8') note = note.decode('utf-8') #change general unicode back to utf-8 msg = msg.encode('utf-8') note = note.encode('utf-8') msg_original = msg (restriction, round_name) = get_record_status(recID) if attached_files is None: attached_files = {} if reply_to and CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH >= 0: # Check that we have not reached max depth comment_ancestors = get_comment_ancestors(reply_to) if len(comment_ancestors) >= CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH: if CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH == 0: reply_to = None else: reply_to = comment_ancestors[CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH - 1] # Inherit restriction and group/round of 'parent' comment = query_get_comment(reply_to) if comment: (round_name, restriction) = comment[10:12] if editor_type == 'ckeditor': # Here we remove the line feeds introduced by CKEditor (they # have no meaning for the user) and replace the HTML line # breaks by linefeeds, so that we are close to an input that # would be done without the CKEditor. That's much better if a # reply to a comment is made with a browser that does not # support CKEditor. msg = msg.replace('\n', '').replace('\r', '') # We clean the quotes that could have been introduced by # CKEditor when clicking the 'quote' button, as well as those # that we have introduced when quoting the original message. # We can however not use directly '>>' chars to quote, as it # will be washed/fixed when calling tidy_html(): double-escape # all > first, and use >> msg = msg.replace('>', '&gt;') msg = re.sub('^\s* \s*<(p|div).*?>', '>>', msg) msg = re.sub('\s*', '', msg) # Then definitely remove any blockquote, whatever it is msg = re.sub('', '
', msg) msg = re.sub('', '
', msg) # Tidy up the HTML msg = tidy_html(msg) # We remove EOL that might have been introduced when tidying msg = msg.replace('\n', '').replace('\r', '') # Now that HTML has been cleaned, unescape > msg = msg.replace('>', '>') msg = msg.replace('&gt;', '>') msg = re.sub('
)', '\n', msg) msg = msg.replace(' ', ' ') # In case additional

or

got inserted, interpret # these as new lines (with a sad trick to do it only once) # (note that it has been deactivated, as it is messing up # indentation with >>) #msg = msg.replace('
<', '\n<') #msg = msg.replace('

<', '

\n<') query = """INSERT INTO cmtRECORDCOMMENT (id_bibrec, id_user, body, date_creation, star_score, nb_votes_total, title, round_name, restriction, in_reply_to_id_cmtRECORDCOMMENT) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""" params = (recID, uid, msg, current_date, score, 0, note, round_name, restriction, reply_to or 0) res = run_sql(query, params) if res: new_comid = int(res) move_attached_files_to_storage(attached_files, recID, new_comid) parent_reply_order = run_sql("""SELECT reply_order_cached_data from cmtRECORDCOMMENT where id=%s""", (reply_to,)) if not parent_reply_order or parent_reply_order[0][0] is None: # This is not a reply, but a first 0-level comment parent_reply_order = '' else: parent_reply_order = parent_reply_order[0][0] - run_sql("""UPDATE cmtRECORDCOMMENT SET reply_order_cached_data=%s WHERE id=%s""", + run_sql("""UPDATE cmtRECORDCOMMENT SET reply_order_cached_data=_binary %s WHERE id=%s""", (parent_reply_order + get_reply_order_cache_data(new_comid), new_comid)) action_code = CFG_WEBCOMMENT_ACTION_CODE[reviews and 'ADD_REVIEW' or 'ADD_COMMENT'] action_time = convert_datestruct_to_datetext(time.localtime()) query2 = """INSERT INTO cmtACTIONHISTORY (id_cmtRECORDCOMMENT, id_bibrec, id_user, client_host, action_time, action_code) VALUES (%s, %s, %s, inet_aton(%s), %s, %s)""" params2 = (res, recID, uid, client_ip_address, action_time, action_code) run_sql(query2, params2) def notify_subscribers_callback(data): """ Define a callback that retrieves subscribed users, and notify them by email. @param data: contains the necessary parameters in a tuple: (recid, uid, comid, msg, note, score, editor_type, reviews) """ recid, uid, comid, msg, note, score, editor_type, reviews = data # Email this comment to 'subscribers' (subscribers_emails1, subscribers_emails2) = \ get_users_subscribed_to_discussion(recid) email_subscribers_about_new_comment(recid, reviews=reviews, emails1=subscribers_emails1, emails2=subscribers_emails2, comID=comid, msg=msg, note=note, score=score, editor_type=editor_type, uid=uid) # Register our callback to notify subscribed people after # having replied to our current user. data = (recID, uid, res, msg, note, score, editor_type, reviews) if req: req.register_cleanup(notify_subscribers_callback, data) else: notify_subscribers_callback(data) return int(res) def move_attached_files_to_storage(attached_files, recID, comid): """ Move the files that were just attached to a new comment to their final location. 
@param attached_files: the mappings of desired filename to attach and path where to find the original file @type attached_files: dict {filename, filepath} @param recID: the record ID to which we attach the files @param comid: the comment ID to which we attach the files """ for filename, filepath in attached_files.iteritems(): dest_dir = os.path.join(CFG_PREFIX, 'var', 'data', 'comments', str(recID), str(comid)) try: os.makedirs(dest_dir) except: # Dir most probably already existed pass shutil.move(filepath, os.path.join(dest_dir, filename)) def get_attached_files(recid, comid): """ Returns a list with tuples (filename, filepath, fileurl) @param recid: the recid to which the comment belong @param comid: the commment id for which we want to retrieve files """ base_dir = os.path.join(CFG_PREFIX, 'var', 'data', 'comments', str(recid), str(comid)) if os.path.isdir(base_dir): filenames = os.listdir(base_dir) return [(filename, os.path.join(CFG_PREFIX, 'var', 'data', 'comments', str(recid), str(comid), filename), CFG_SITE_URL + '/'+ CFG_SITE_RECORD +'/' + str(recid) + '/comments/attachments/get/' + str(comid) + '/' + filename) \ for filename in filenames] else: return [] def subscribe_user_to_discussion(recID, uid): """ Subscribe a user to a discussion, so the she receives by emails all new new comments for this record. @param recID: record ID corresponding to the discussion we want to subscribe the user @param uid: user id """ query = """INSERT INTO cmtSUBSCRIPTION (id_bibrec, id_user, creation_time) VALUES (%s, %s, %s)""" params = (recID, uid, convert_datestruct_to_datetext(time.localtime())) try: run_sql(query, params) except: return 0 return 1 def unsubscribe_user_from_discussion(recID, uid): """ Unsubscribe users from a discussion. @param recID: record ID corresponding to the discussion we want to unsubscribe the user @param uid: user id @return 1 if successful, 0 if not """ query = """DELETE FROM cmtSUBSCRIPTION WHERE id_bibrec=%s AND id_user=%s""" params = (recID, uid) try: res = run_sql(query, params) except: return 0 if res > 0: return 1 return 0 def get_user_subscription_to_discussion(recID, uid): """ Returns the type of subscription for the given user to this discussion. This does not check authorizations (for eg. if user was subscribed, but is suddenly no longer authorized). @param recID: record ID @param uid: user id @return: - 0 if user is not subscribed to discussion - 1 if user is subscribed, and is allowed to unsubscribe - 2 if user is subscribed, but cannot unsubscribe """ user_email = get_email(uid) (emails1, emails2) = get_users_subscribed_to_discussion(recID, check_authorizations=False) if user_email in emails1: return 1 elif user_email in emails2: return 2 else: return 0 def get_users_subscribed_to_discussion(recID, check_authorizations=True): """ Returns the lists of users subscribed to a given discussion. Two lists are returned: the first one is the list of emails for users who can unsubscribe from the discussion, the second list contains the emails of users who cannot unsubscribe (for eg. author of the document, etc). Users appear in only one list. If a user has manually subscribed to a discussion AND is an automatic recipients for updates, it will only appear in the second list. 
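move_attached_files_to_storage() and get_attached_files() above agree on a simple per-comment directory layout under CFG_PREFIX and a matching download URL. With made-up values (recid 123, comid 456, CFG_PREFIX '/opt/invenio', CFG_SITE_URL 'http://example.org', CFG_SITE_RECORD 'record'), an attachment named report.pdf would be:

    #   stored at:  /opt/invenio/var/data/comments/123/456/report.pdf
    #   served at:  http://example.org/record/123/comments/attachments/get/456/report.pdf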
@param recID: record ID for which we want to retrieve subscribed users @param check_authorizations: if True, check again if users are authorized to view comment @return tuple (emails1, emails2) """ subscribers_emails = {} # Get users that have subscribed to this discussion query = """SELECT id_user FROM cmtSUBSCRIPTION WHERE id_bibrec=%s""" params = (recID,) res = run_sql(query, params) for row in res: uid = row[0] if check_authorizations: user_info = collect_user_info(uid) (auth_code, auth_msg) = check_user_can_view_comments(user_info, recID) else: # Don't check and grant access auth_code = False if auth_code: # User is no longer authorized to view comments. # Delete subscription unsubscribe_user_from_discussion(recID, uid) else: email = get_email(uid) if '@' in email: subscribers_emails[email] = True # Get users automatically subscribed, based on the record metadata collections_with_auto_replies = CFG_WEBCOMMENT_EMAIL_REPLIES_TO.keys() for collection in collections_with_auto_replies: if (get_colID(collection) is not None) and \ (recID in get_collection_reclist(collection)): fields = CFG_WEBCOMMENT_EMAIL_REPLIES_TO[collection] for field in fields: emails = get_fieldvalues(recID, field) for email in emails: if not '@' in email: # Is a group: add domain name subscribers_emails[email + '@' + \ CFG_SITE_SUPPORT_EMAIL.split('@')[1]] = False else: subscribers_emails[email] = False return ([email for email, can_unsubscribe_p \ in subscribers_emails.iteritems() if can_unsubscribe_p], [email for email, can_unsubscribe_p \ in subscribers_emails.iteritems() if not can_unsubscribe_p] ) def email_subscribers_about_new_comment(recID, reviews, emails1, emails2, comID, msg="", note="", score=0, editor_type='textarea', ln=CFG_SITE_LANG, uid=-1): """ Notify subscribers that a new comment was posted. FIXME: consider recipient preference to send email in correct language. @param recID: record id @param emails1: list of emails for users who can unsubscribe from discussion @param emails2: list of emails for users who cannot unsubscribe from discussion @param comID: the comment id @param msg: comment body @param note: comment title @param score: review star score @param editor_type: the kind of editor used to submit the comment: 'textarea', 'ckeditor' @rtype: bool @return: True if email was sent okay, False if it was not. 
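In get_users_subscribed_to_discussion() above, a metadata value without an '@' is taken to be a group or list name and gets the domain of CFG_SITE_SUPPORT_EMAIL appended before it is added to the "cannot unsubscribe" list. A sketch with purely illustrative values (the field content and support address are invented):

    # Illustrative only; mirrors the branch in get_users_subscribed_to_discussion().
    CFG_SITE_SUPPORT_EMAIL = 'support@example.org'   # made-up value
    email = 'atlas-editors'                          # value read from the record
    if not '@' in email:
        email = email + '@' + CFG_SITE_SUPPORT_EMAIL.split('@')[1]
    # email == 'atlas-editors@example.org', and it ends up in the second
    # element of the returned tuple (recipients who cannot unsubscribe).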
""" _ = gettext_set_language(ln) if not emails1 and not emails2: return 0 # Get title titles = get_fieldvalues(recID, "245__a") if not titles: # usual title not found, try conference title: titles = get_fieldvalues(recID, "111__a") title = '' if titles: title = titles[0] else: title = _("Record %i") % recID # Get report number report_numbers = get_fieldvalues(recID, "037__a") if not report_numbers: report_numbers = get_fieldvalues(recID, "088__a") if not report_numbers: report_numbers = get_fieldvalues(recID, "021__a") # Prepare email subject and body if reviews: email_subject = _('%(report_number)s"%(title)s" has been reviewed') % \ {'report_number': report_numbers and ('[' + report_numbers[0] + '] ') or '', 'title': title} else: email_subject = _('%(report_number)s"%(title)s" has been commented') % \ {'report_number': report_numbers and ('[' + report_numbers[0] + '] ') or '', 'title': title} washer = EmailWasher() msg = washer.wash(msg) msg = msg.replace('>>', '>') email_content = msg if note: email_content = note + email_content # Send emails to people who can unsubscribe email_header = webcomment_templates.tmpl_email_new_comment_header(recID, title, reviews, comID, report_numbers, can_unsubscribe=True, ln=ln, uid=uid) email_footer = webcomment_templates.tmpl_email_new_comment_footer(recID, title, reviews, comID, report_numbers, can_unsubscribe=True, ln=ln) res1 = True if emails1: res1 = send_email(fromaddr=CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL, toaddr=emails1, subject=email_subject, content=email_content, header=email_header, footer=email_footer, ln=ln) # Then send email to people who have been automatically # subscribed to the discussion (they cannot unsubscribe) email_header = webcomment_templates.tmpl_email_new_comment_header(recID, title, reviews, comID, report_numbers, can_unsubscribe=False, ln=ln, uid=uid) email_footer = webcomment_templates.tmpl_email_new_comment_footer(recID, title, reviews, comID, report_numbers, can_unsubscribe=False, ln=ln) res2 = True if emails2: res2 = send_email(fromaddr=CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL, toaddr=emails2, subject=email_subject, content=email_content, header=email_header, footer=email_footer, ln=ln) return res1 and res2 def get_record_status(recid): """ Returns the current status of the record, i.e. current restriction to apply for newly submitted comments, and current commenting round. The restriction to apply can be found in the record metadata, in field(s) defined by config CFG_WEBCOMMENT_RESTRICTION_DATAFIELD. The restriction is empty string "" in cases where the restriction has not explicitely been set, even if the record itself is restricted. 
@param recid: the record id @type recid: int @return tuple(restriction, round_name), where 'restriction' is empty string when no restriction applies @rtype (string, int) """ collections_with_rounds = CFG_WEBCOMMENT_ROUND_DATAFIELD.keys() commenting_round = "" for collection in collections_with_rounds: # Find the first collection defines rounds field for this # record if get_colID(collection) is not None and \ (recid in get_collection_reclist(collection)): commenting_rounds = get_fieldvalues(recid, CFG_WEBCOMMENT_ROUND_DATAFIELD.get(collection, "")) if commenting_rounds: commenting_round = commenting_rounds[0] break collections_with_restrictions = CFG_WEBCOMMENT_RESTRICTION_DATAFIELD.keys() restriction = "" for collection in collections_with_restrictions: # Find the first collection that defines restriction field for # this record if get_colID(collection) is not None and \ recid in get_collection_reclist(collection): restrictions = get_fieldvalues(recid, CFG_WEBCOMMENT_RESTRICTION_DATAFIELD.get(collection, "")) if restrictions: restriction = restrictions[0] break return (restriction, commenting_round) def calculate_start_date(display_since): """ Private function Returns the datetime of display_since argument in MYSQL datetime format calculated according to the local time. @param display_since: = all= no filtering nd = n days ago nw = n weeks ago nm = n months ago ny = n years ago where n is a single digit number @return: string of wanted datetime. If 'all' given as argument, will return datetext_default datetext_default is defined in miscutils/lib/dateutils and equals 0000-00-00 00:00:00 => MySQL format If bad arguement given, will return datetext_default If library 'dateutil' is not found return datetext_default and register exception. """ time_types = {'d':0, 'w':0, 'm':0, 'y':0} today = datetime.today() try: nb = int(display_since[:-1]) except: return datetext_default if display_since in [None, 'all']: return datetext_default if str(display_since[-1]) in time_types: time_type = str(display_since[-1]) else: return datetext_default # year if time_type == 'y': if (int(display_since[:-1]) > today.year - 1) or (int(display_since[:-1]) < 1): # 1 < nb years < 2008 return datetext_default else: final_nb_year = today.year - nb yesterday = today.replace(year=final_nb_year) # month elif time_type == 'm': try: from dateutil.relativedelta import relativedelta except ImportError: # The dateutil library is only recommended: if not # available, then send warning about this. register_exception(alert_admin=True) return datetext_default # obtain only the date: yyyy-mm-dd date_today = datetime.now().date() final_date = date_today - relativedelta(months=nb) yesterday = today.replace(year=final_date.year, month=final_date.month, day=final_date.day) # week elif time_type == 'w': delta = timedelta(weeks=nb) yesterday = today - delta # day elif time_type == 'd': delta = timedelta(days=nb) yesterday = today - delta return yesterday.strftime("%Y-%m-%d %H:%M:%S") def get_first_comments_or_remarks(recID=-1, ln=CFG_SITE_LANG, nb_comments='all', nb_reviews='all', voted=-1, reported=-1, user_info=None, show_reviews=False): """ Gets nb number comments/reviews or remarks. 
In the case of comments, will get both comments and reviews Comments and remarks sorted by most recent date, reviews sorted by highest helpful score @param recID: record id @param ln: language @param nb_comments: number of comment or remarks to get @param nb_reviews: number of reviews or remarks to get @param voted: 1 if user has voted for a remark @param reported: 1 if user has reported a comment or review @return: if comment, tuple (comments, reviews) both being html of first nb comments/reviews if remark, tuple (remakrs, None) """ _ = gettext_set_language(ln) warnings = [] voted = wash_url_argument(voted, 'int') reported = wash_url_argument(reported, 'int') ## check recID argument if type(recID) is not int: return () if recID >= 1: #comment or review. NB: suppressed reference to basket (handled in webbasket) if CFG_WEBCOMMENT_ALLOW_REVIEWS: res_reviews = query_retrieve_comments_or_remarks(recID=recID, display_order="hh", ranking=1, limit=nb_comments, user_info=user_info) nb_res_reviews = len(res_reviews) ## check nb argument if type(nb_reviews) is int and nb_reviews < len(res_reviews): first_res_reviews = res_reviews[:nb_reviews] else: first_res_reviews = res_reviews if CFG_WEBCOMMENT_ALLOW_COMMENTS: res_comments = query_retrieve_comments_or_remarks(recID=recID, display_order="od", ranking=0, limit=nb_reviews, user_info=user_info) nb_res_comments = len(res_comments) ## check nb argument if type(nb_comments) is int and nb_comments < len(res_comments): first_res_comments = res_comments[:nb_comments] else: first_res_comments = res_comments else: body = webcomment_templates.tmpl_error( _('%s is an invalid record ID') % recID, ln) return body # comment if recID >= 1: comments = reviews = "" if reported > 0: warnings.append((_('Your feedback has been recorded, many ' 'thanks.'), 'green')) elif reported == 0: warnings.append((_('Your feedback could not be recorded, please' ' try again.'), '')) # normal comments if CFG_WEBCOMMENT_ALLOW_COMMENTS: grouped_comments = group_comments_by_round(first_res_comments, ranking=0) comments = webcomment_templates.tmpl_get_first_comments_without_ranking(recID, ln, grouped_comments, nb_res_comments, warnings) if show_reviews: # ranked comments if CFG_WEBCOMMENT_ALLOW_REVIEWS: # calculate average score avg_score = calculate_avg_score(res_reviews) if voted > 0: warnings.append((_('Your feedback has been recorded, ' 'many thanks.'), 'green')) elif voted == 0: warnings.append((_('Your feedback could not be recorded, ' 'please try again.'), '')) grouped_reviews = group_comments_by_round(first_res_reviews, ranking=0) reviews = webcomment_templates.tmpl_get_first_comments_with_ranking(recID, ln, grouped_reviews, nb_res_reviews, avg_score, warnings) return (comments, reviews) # remark else: return(webcomment_templates.tmpl_get_first_remarks(first_res_comments, ln, nb_res_comments), None) def group_comments_by_round(comments, ranking=0): """ Group comments by the round to which they belong """ comment_rounds = {} ordered_comment_round_names = [] for comment in comments: comment_round_name = ranking and comment[11] or comment[7] if not comment_rounds.has_key(comment_round_name): comment_rounds[comment_round_name] = [] ordered_comment_round_names.append(comment_round_name) comment_rounds[comment_round_name].append(comment) return [(comment_round_name, comment_rounds[comment_round_name]) \ for comment_round_name in ordered_comment_round_names] def calculate_avg_score(res): """ private function Calculate the avg score of reviews present in res @param res: tuple of tuple 
returned from query_retrieve_comments_or_remarks @return: a float of the average score rounded to the closest 0.5 """ c_star_score = 6 avg_score = 0.0 nb_reviews = 0 for comment in res: if comment[c_star_score] > 0: avg_score += comment[c_star_score] nb_reviews += 1 if nb_reviews == 0: return 0.0 avg_score = avg_score / nb_reviews avg_score_unit = avg_score - math.floor(avg_score) if avg_score_unit < 0.25: avg_score = math.floor(avg_score) elif avg_score_unit > 0.75: avg_score = math.floor(avg_score) + 1 else: avg_score = math.floor(avg_score) + 0.5 if avg_score > 5: avg_score = 5.0 return avg_score def perform_request_add_comment_or_remark(recID=0, uid=-1, action='DISPLAY', ln=CFG_SITE_LANG, msg=None, score=None, note=None, priority=None, reviews=0, comID=0, client_ip_address=None, editor_type='textarea', can_attach_files=False, subscribe=False, req=None, attached_files=None, warnings=None): """ Add a comment/review or remark @param recID: record id @param uid: user id @param action: 'DISPLAY' to display add form 'SUBMIT' to submit comment once form is filled 'REPLY' to reply to an existing comment @param ln: language @param msg: the body of the comment/review or remark @param score: star score of the review @param note: title of the review @param priority: priority of remark (int) @param reviews: boolean, if enabled will add a review, if disabled will add a comment @param comID: if replying, this is the comment id of the comment we are replying to @param editor_type: the kind of editor/input used for the comment: 'textarea', 'ckeditor' @param can_attach_files: if user can attach files to comments or not @param subscribe: if True, subscribe user to receive new comments by email @param req: request object. Used to register callback to send email notification @param attached_files: newly attached files to this comment, mapping filename to filepath @type attached_files: dict @param warnings: list of warning tuples (warning_text, warning_color) that should be considered @return: - html add form if action is display or reply - html successful added form if action is submit """ _ = gettext_set_language(ln) if warnings is None: warnings = [] actions = ['DISPLAY', 'REPLY', 'SUBMIT'] _ = gettext_set_language(ln) ## check arguments check_recID_is_in_range(recID, warnings, ln) if uid <= 0: body = webcomment_templates.tmpl_error( _('%s is an invalid user ID.') % uid, ln) return body if attached_files is None: attached_files = {} user_contact_info = query_get_user_contact_info(uid) nickname = '' if user_contact_info: if user_contact_info[0]: nickname = user_contact_info[0] # show the form if action == 'DISPLAY': if reviews and CFG_WEBCOMMENT_ALLOW_REVIEWS: return webcomment_templates.tmpl_add_comment_form_with_ranking(recID, uid, nickname, ln, msg, score, note, warnings, can_attach_files=can_attach_files) elif not reviews and CFG_WEBCOMMENT_ALLOW_COMMENTS: return webcomment_templates.tmpl_add_comment_form(recID, uid, nickname, ln, msg, warnings, can_attach_files=can_attach_files) else: body = webcomment_templates.tmpl_error( _('Comments on records have been disallowed by the ' 'administrator.'), ln) return body elif action == 'REPLY': if reviews and CFG_WEBCOMMENT_ALLOW_REVIEWS: body = webcomment_templates.tmpl_error( _('Cannot reply to a review.'), ln) return body elif not reviews and CFG_WEBCOMMENT_ALLOW_COMMENTS: textual_msg = msg if comID > 0: comment = query_get_comment(comID) if comment: user_info = get_user_info(comment[2]) if user_info: date_creation = 
convert_datetext_to_dategui(str(comment[4]))
# Build two msg: one mostly textual, the other one with HTML markup, for the CkEditor.
msg = _("%(x_name)s wrote on %(x_date)s:")% {'x_name': user_info[2], 'x_date': date_creation}
textual_msg = msg
# 1 For CkEditor input
msg += '\n\n'
msg += comment[3]
msg = email_quote_txt(text=msg)
# Now that we have a text-quoted version, transform into
# something that CkEditor likes, using <blockquote> that
# do still enable users to insert comments inline
msg = email_quoted_txt2html(text=msg, indent_html=('<blockquote><div>', '&nbsp;&nbsp;</div></blockquote>'), linebreak_html="&nbsp;<br/>", indent_block=False)
# Add some space for users to easily add text
# around the quoted message
msg = '<br/>' + msg + '<br/>
' # Due to how things are done, we need to # escape the whole msg again for the editor msg = cgi.escape(msg) # 2 For textarea input textual_msg += "\n\n" textual_msg += comment[3] textual_msg = email_quote_txt(text=textual_msg) return webcomment_templates.tmpl_add_comment_form(recID, uid, nickname, ln, msg, warnings, textual_msg, can_attach_files=can_attach_files, reply_to=comID) else: body = webcomment_templates.tmpl_error( _('Comments on records have been disallowed by the ' 'administrator.'), ln) return body # check before submitting form elif action == 'SUBMIT': if reviews and CFG_WEBCOMMENT_ALLOW_REVIEWS: if note.strip() in ["", "None"] and not CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS: warnings.append((_('You must enter a title.'), '')) if score == 0 or score > 5: warnings.append((_('You must choose a score.'), '')) if msg.strip() in ["", "None"] and not CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS: warnings.append((_('You must enter a text.'), '')) # if no warnings, submit if len(warnings) == 0: if reviews: if check_user_can_review(recID, client_ip_address, uid): success = query_add_comment_or_remark(reviews, recID=recID, uid=uid, msg=msg, note=note, score=score, priority=0, client_ip_address=client_ip_address, editor_type=editor_type, req=req, reply_to=comID) else: warnings.append((_('You already wrote a review for ' 'this record.'), '')) success = 1 else: if check_user_can_comment(recID, client_ip_address, uid): success = query_add_comment_or_remark(reviews, recID=recID, uid=uid, msg=msg, note=note, score=score, priority=0, client_ip_address=client_ip_address, editor_type=editor_type, req=req, reply_to=comID, attached_files=attached_files) if success > 0 and subscribe: subscribe_user_to_discussion(recID, uid) else: warnings.append((_('You already posted a comment ' 'short ago. Please retry later.'), '')) success = 1 if success > 0: if CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL > 0: notify_admin_of_new_comment(comID=success) return webcomment_templates.tmpl_add_comment_successful(recID, ln, reviews, warnings, success) else: register_exception(req=req) body = webcomment_templates.tmpl_error( _('Failed to insert your comment to the database.' 
' Please try again.'), ln) return body # if are warnings or if inserting comment failed, show user where warnings are if reviews and CFG_WEBCOMMENT_ALLOW_REVIEWS: return webcomment_templates.tmpl_add_comment_form_with_ranking(recID, uid, nickname, ln, msg, score, note, warnings, can_attach_files=can_attach_files) else: return webcomment_templates.tmpl_add_comment_form(recID, uid, nickname, ln, msg, warnings, can_attach_files=can_attach_files) # unknown action send to display else: warnings.append((_('Unknown action --> showing you the default ' 'add comment form.'), '')) if reviews and CFG_WEBCOMMENT_ALLOW_REVIEWS: return webcomment_templates.tmpl_add_comment_form_with_ranking(recID, uid, ln, msg, score, note, warnings, can_attach_files=can_attach_files) else: return webcomment_templates.tmpl_add_comment_form(recID, uid, ln, msg, warnings, can_attach_files=can_attach_files) return '' def notify_admin_of_new_comment(comID): """ Sends an email to the admin with details regarding comment with ID = comID """ comment = query_get_comment(comID) if len(comment) > 0: (comID2, id_bibrec, id_user, body, date_creation, star_score, nb_votes_yes, nb_votes_total, title, nb_abuse_reports, round_name, restriction) = comment else: return user_info = query_get_user_contact_info(id_user) if len(user_info) > 0: (nickname, email, last_login) = user_info if not len(nickname) > 0: nickname = email.split('@')[0] else: nickname = email = last_login = "ERROR: Could not retrieve" review_stuff = ''' Star score = %s Title = %s''' % (star_score, title) washer = EmailWasher() try: body = washer.wash(body) except: body = cgi.escape(body) record_info = webcomment_templates.tmpl_email_new_comment_admin(id_bibrec) out = ''' The following %(comment_or_review)s has just been posted (%(date)s). AUTHOR: Nickname = %(nickname)s Email = %(email)s User ID = %(uid)s RECORD CONCERNED: Record ID = %(recID)s URL = <%(siteurl)s/%(CFG_SITE_RECORD)s/%(recID)s/%(comments_or_reviews)s/> %(record_details)s %(comment_or_review_caps)s: %(comment_or_review)s ID = %(comID)s %(review_stuff)s Body = <---------------> %(body)s <---------------> ADMIN OPTIONS: To moderate the %(comment_or_review)s go to %(siteurl)s/%(CFG_SITE_RECORD)s/%(recID)s/%(comments_or_reviews)s/display?%(arguments)s ''' % \ { 'comment_or_review' : star_score > 0 and 'review' or 'comment', 'comment_or_review_caps': star_score > 0 and 'REVIEW' or 'COMMENT', 'comments_or_reviews' : star_score > 0 and 'reviews' or 'comments', 'date' : date_creation, 'nickname' : nickname, 'email' : email, 'uid' : id_user, 'recID' : id_bibrec, 'record_details' : record_info, 'comID' : comID2, 'review_stuff' : star_score > 0 and review_stuff or "", 'body' : body.replace('
','\n'), 'siteurl' : CFG_SITE_URL, 'CFG_SITE_RECORD' : CFG_SITE_RECORD, 'arguments' : 'ln=en&do=od#%s' % comID } from_addr = '%s WebComment <%s>' % (CFG_SITE_NAME, CFG_WEBALERT_ALERT_ENGINE_EMAIL) comment_collection = get_comment_collection(comID) to_addrs = get_collection_moderators(comment_collection) rec_collection = guess_primary_collection_of_a_record(id_bibrec) report_nums = get_fieldvalues(id_bibrec, "037__a") report_nums += get_fieldvalues(id_bibrec, "088__a") report_nums = ', '.join(report_nums) subject = "A new comment/review has just been posted [%s|%s]" % (rec_collection, report_nums) send_email(from_addr, to_addrs, subject, out) def check_recID_is_in_range(recID, warnings=[], ln=CFG_SITE_LANG): """ Check that recID is >= 0 @param recID: record id @param warnings: list of warning tuples (warning_text, warning_color) @return: tuple (boolean, html) where boolean (1=true, 0=false) and html is the body of the page to display if there was a problem """ _ = gettext_set_language(ln) try: recID = int(recID) except: pass if type(recID) is int: if recID > 0: from invenio.search_engine import record_exists success = record_exists(recID) if success == 1: return (1, "") else: if success == -1: status = 'deleted' warning_message = _('The record has been deleted.') else: status = 'inexistant' warning_message = _( 'Record ID %s does not exist in the database.') % recID warnings.append((warning_message, '')) return (0, webcomment_templates.tmpl_record_not_found( status=status, recID=recID, ln=ln)) elif recID == 0: warnings.append((_('No record ID was given.'), '')) return (0, webcomment_templates.tmpl_record_not_found( status='missing', recID=recID, ln=ln)) else: warnings.append((_('Record ID %s is an invalid ID.') % recID, '')) return (0, webcomment_templates.tmpl_record_not_found( status='invalid', recID=recID, ln=ln)) else: warnings.append((_('Record ID %s is not a number.') % recID, '')) return (0, webcomment_templates.tmpl_record_not_found( status='nan', recID=recID, ln=ln)) def check_int_arg_is_in_range(value, name, gte_value, lte_value=None): """ Check that variable with name 'name' >= gte_value & optionally <= lte_value @param value: variable value @param name: variable name @param errors: list of error tuples (error_id, value) @param gte_value: greater than or equal to value @param lte_value: less than or equal to value @return: boolean (1=true, 0=false) """ if type(value) is not int: body = webcomment_templates.tmpl_error('%s is not a number.' % value) return body if value < gte_value: body = webcomment_templates.tmpl_error('%s invalid argument.' % value) return body if lte_value: if value > lte_value: body = webcomment_templates.tmpl_error( '%s invalid argument.' % value) return body return 1 def get_mini_reviews(recid, ln=CFG_SITE_LANG): """ Returns the web controls to add reviews to a record from the detailed record pages mini-panel. @param recid: the id of the displayed record @param ln: the user's language """ if CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS: action = 'SUBMIT' else: action = 'DISPLAY' reviews = query_retrieve_comments_or_remarks(recid, ranking=1) return webcomment_templates.tmpl_mini_review(recid, ln, action=action, avg_score=calculate_avg_score(reviews), nb_comments_total=len(reviews)) def check_user_can_view_comments(user_info, recid): """Check if the user is authorized to view comments for given recid. 
Returns the same type as acc_authorize_action """ # Check user can view the record itself first (auth_code, auth_msg) = check_user_can_view_record(user_info, recid) if auth_code: return (auth_code, auth_msg) # Check if user can view the comments ## But first can we find an authorization for this case action, ## for this collection? record_primary_collection = guess_primary_collection_of_a_record(recid) return acc_authorize_action(user_info, 'viewcomment', authorized_if_no_roles=True, collection=record_primary_collection) def check_user_can_view_comment(user_info, comid, restriction=None): """Check if the user is authorized to view a particular comment, given the comment restriction. Note that this function does not check if the record itself is restricted to the user, which would mean that the user should not see the comment. You can omit 'comid' if you already know the 'restriction' @param user_info: the user info object @param comid: the comment id of that we want to check @param restriction: the restriction applied to given comment (if known. Otherwise retrieved automatically) @return: the same type as acc_authorize_action """ if restriction is None: comment = query_get_comment(comid) if comment: restriction = comment[11] else: return (1, 'Comment %i does not exist' % comid) if restriction == "": return (0, '') return acc_authorize_action(user_info, 'viewrestrcomment', status=restriction) def check_user_can_send_comments(user_info, recid): """Check if the user is authorized to comment the given recid. This function does not check that user can view the record or view the comments Returns the same type as acc_authorize_action """ ## First can we find an authorization for this case, action + collection record_primary_collection = guess_primary_collection_of_a_record(recid) return acc_authorize_action(user_info, 'sendcomment', authorized_if_no_roles=True, collection=record_primary_collection) def check_comment_belongs_to_record(comid, recid): """ Return True if the comment is indeed part of given record (even if comment or/and record have been "deleted"). Else return False. @param comid: the id of the comment to check membership @param recid: the recid of the record we want to check if comment belongs to """ query = """SELECT id_bibrec from cmtRECORDCOMMENT WHERE id=%s""" params = (comid,) res = run_sql(query, params) if res and res[0][0] == recid: return True return False def check_user_can_attach_file_to_comments(user_info, recid): """Check if the user is authorized to attach a file to comments for given recid. This function does not check that user can view the comments or send comments. Returns the same type as acc_authorize_action """ ## First can we find an authorization for this case action, for ## this collection? record_primary_collection = guess_primary_collection_of_a_record(recid) return acc_authorize_action(user_info, 'attachcommentfile', authorized_if_no_roles=False, collection=record_primary_collection) def toggle_comment_visibility(uid, comid, collapse, recid): """ Toggle the visibility of the given comment (collapse) for the given user. 
Return the new visibility @param uid: the user id for which the change applies @param comid: the comment id to close/open @param collapse: if the comment is to be closed (1) or opened (0) @param recid: the record id to which the comment belongs @return: if the comment is visible or not after the update """ # We rely on the client to tell if comment should be collapsed or # developed, to ensure consistency between our internal state and # client state. Even if not strictly necessary, we store the # record ID for quicker retrieval of the collapsed comments of a # given discussion page. To prevent unnecessary population of the # table, only one distinct tuple (record ID, comment ID, user ID) # can be inserted (due to table definition). For the same purpose # we also check that comment to collapse exists, and corresponds # to an existing record: we cannot rely on the recid found as part # of the URL, as no former check is done. This rule is not applied # when deleting an entry, as in the worst case no line would be # removed. For optimized retrieval of row to delete, the id_bibrec # column is used, though not strictly necessary. if collapse: query = """SELECT id_bibrec from cmtRECORDCOMMENT WHERE id=%s""" params = (comid,) res = run_sql(query, params) if res: query = """INSERT IGNORE INTO cmtCOLLAPSED (id_bibrec, id_cmtRECORDCOMMENT, id_user) VALUES (%s, %s, %s)""" params = (res[0][0], comid, uid) run_sql(query, params) return True else: query = """DELETE FROM cmtCOLLAPSED WHERE id_cmtRECORDCOMMENT=%s and id_user=%s and id_bibrec=%s""" params = (comid, uid, recid) run_sql(query, params) return False def get_user_collapsed_comments_for_record(uid, recid): """ Get the comments collapsed for given user on given recid page """ # Collapsed state is not an attribute of cmtRECORDCOMMENT table # (vary per user) so it cannot be found when querying for the # comment. We must therefore provide a efficient way to retrieve # the collapsed state for a given discussion page and user. query = """SELECT id_cmtRECORDCOMMENT from cmtCOLLAPSED WHERE id_user=%s and id_bibrec=%s""" params = (uid, recid) return [res[0] for res in run_sql(query, params)] def is_comment_deleted(comid): """ Return True of the comment is deleted. Else False @param comid: ID of comment to check """ query = "SELECT status from cmtRECORDCOMMENT WHERE id=%s" params = (comid,) res = run_sql(query, params) if res and res[0][0] != 'ok': return True return False def perform_display_your_comments(user_info, page_number=1, selected_order_by_option="lcf", selected_display_number_option="all", selected_display_format_option="rc", ln=CFG_SITE_LANG): """ Display all comments submitted by the user. @TODO: support reviews too @param user_info: standard user info object. @param comments: ordered list of tuples (id_bibrec, comid, date_creation, body, status, in_reply_to_id_cmtRECORDCOMMENT) @param page_number: page on which the user is. @type page_number: integer @param selected_order_by_option: seleccted ordering option. Can be one of: - ocf: Oldest comment first - lcf: Latest comment first - grof: Group by record, oldest commented first - grlf: Group by record, latest commented first @type selected_order_by_option: string @param selected_display_number_option: number of results to show per page. Can be a string-digit or 'all'. @type selected_display_number_option: string @param selected_display_format_option: how to show records. 
Can be one of: - rc: Records and comments - ro: Records only - co: Comments only @type selected_display_format_option: string @ln: language @type ln: string """ query_params = "" nb_total_pages = 0 if selected_display_format_option in ('rc', 'co'): nb_total_results = run_sql("SELECT count(id) from cmtRECORDCOMMENT WHERE id_user=%s AND star_score = 0", \ (user_info['uid'], ))[0][0] else: if selected_order_by_option in ('grlf', 'grof'): nb_total_results = run_sql("SELECT count(distinct(id_bibrec)) from cmtRECORDCOMMENT WHERE id_user=%s AND star_score = 0", \ (user_info['uid'], ))[0][0] else: nb_total_results = run_sql("SELECT count(id_bibrec) from cmtRECORDCOMMENT WHERE id_user=%s AND star_score = 0", \ (user_info['uid'], ))[0][0] if page_number < 1: page_number = 1 if selected_display_number_option != 'all' and \ not selected_display_number_option.isdigit(): # must be some garbage selected_display_number_option = 'all' query = '' if selected_order_by_option == "lcf": query_params += " ORDER BY date_creation DESC" elif selected_order_by_option == "ocf": query_params += " ORDER BY date_creation ASC" elif selected_order_by_option == "grlf": query = "SELECT cmt.id_bibrec, cmt.id, cmt.date_creation, cmt.body, cmt.status, cmt.in_reply_to_id_cmtRECORDCOMMENT FROM cmtRECORDCOMMENT as cmt left join (SELECT max(date_creation) as maxdatecreation, id_bibrec FROM cmtRECORDCOMMENT WHERE id_user=%s AND star_score = 0 GROUP BY id_bibrec) as grp on cmt.id_bibrec = grp.id_bibrec WHERE id_user=%s AND star_score = 0 ORDER BY grp.maxdatecreation DESC, cmt.date_creation DESC" elif selected_order_by_option == "grof": query = "SELECT cmt.id_bibrec, cmt.id, cmt.date_creation, cmt.body, cmt.status, cmt.in_reply_to_id_cmtRECORDCOMMENT FROM cmtRECORDCOMMENT as cmt left join (SELECT min(date_creation) as mindatecreation, id_bibrec FROM cmtRECORDCOMMENT WHERE id_user=%s AND star_score = 0 GROUP BY id_bibrec) as grp on cmt.id_bibrec = grp.id_bibrec WHERE id_user=%s AND star_score = 0 ORDER BY grp.mindatecreation ASC" if selected_display_number_option.isdigit(): selected_display_number_option_as_int = int(selected_display_number_option) if selected_display_number_option_as_int < 5: selected_display_number_option_as_int = 5 selected_display_number_option = str(selected_display_number_option_as_int) from_index = (page_number - 1) * int(selected_display_number_option) query_params += ' LIMIT ' + \ str(from_index) + \ ',' + \ str(int(selected_display_number_option)) nb_total_pages = int(math.ceil(float(nb_total_results) / selected_display_number_option_as_int)) if selected_order_by_option in ("grlf", "grof"): res = run_sql(query + query_params, (user_info['uid'], user_info['uid'])) else: res = run_sql("SELECT id_bibrec, id, date_creation, body, status, in_reply_to_id_cmtRECORDCOMMENT FROM cmtRECORDCOMMENT WHERE id_user=%s AND star_score = 0" + query_params, (user_info['uid'], )) return webcomment_templates.tmpl_your_comments(user_info, res, page_number=page_number, selected_order_by_option=selected_order_by_option, selected_display_number_option=selected_display_number_option, selected_display_format_option=selected_display_format_option, nb_total_results=nb_total_results, nb_total_pages=nb_total_pages, ln=ln) diff --git a/modules/webcomment/lib/webcommentadminlib.py b/modules/webcomment/lib/webcommentadminlib.py index fac14b4b6..b586d7ccd 100644 --- a/modules/webcomment/lib/webcommentadminlib.py +++ b/modules/webcomment/lib/webcommentadminlib.py @@ -1,690 +1,690 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. 
-# Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN. +# Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. __revision__ = "$Id$" from invenio.config import CFG_SITE_LANG, CFG_SITE_URL, \ CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN from invenio.webcomment_config import InvenioWebCommentWarning from invenio.webcomment import query_get_comment, \ get_reply_order_cache_data from invenio.urlutils import wash_url_argument from invenio.dbquery import run_sql from invenio.messages import gettext_set_language, wash_language from invenio.errorlib import register_exception from invenio.webuser import get_user_info, collect_user_info, \ isUserAdmin from invenio.access_control_engine import acc_authorize_action, \ acc_get_authorized_emails from invenio.search_engine import perform_request_search import invenio.template webcomment_templates = invenio.template.load('webcomment') def getnavtrail(previous = '', ln=CFG_SITE_LANG): """Get the navtrail""" previous = wash_url_argument(previous, 'str') ln = wash_language(ln) _ = gettext_set_language(ln) navtrail = """%s """ % (CFG_SITE_URL, _("Admin Area")) navtrail = navtrail + previous return navtrail def get_nb_reviews(recID, count_deleted=True): """ Return number of reviews for the record recID if count_deleted is True, deleted reviews are also counted """ query = """SELECT count(*) FROM cmtRECORDCOMMENT c WHERE c.id_bibrec = %s and c.star_score > 0 """ if not count_deleted: query += "and c.status != 'dm' and c.status != 'da'" res = run_sql(query, (recID,)) return res[0][0] def get_nb_comments(recID, count_deleted=True): """ Return number of comments for the record recID if count_deleted is True, deleted comments are also counted """ query = """SELECT count(*) FROM cmtRECORDCOMMENT c WHERE c.id_bibrec = %s and c.star_score = 0 """ if not count_deleted: query += "and c.status != 'dm' and c.status != 'da'" res = run_sql(query, (recID,)) return res[0][0] def get_user_collections(req): """ Return collections for which the user is moderator """ user_info = collect_user_info(req) res = [] collections = run_sql('SELECT name FROM collection') for collection in collections: collection_emails = acc_get_authorized_emails('moderatecomments', collection=collection[0]) if user_info['email'] in collection_emails or isUserAdmin(user_info): res.append(collection[0]) return res def perform_request_index(ln=CFG_SITE_LANG): """ """ return webcomment_templates.tmpl_admin_index(ln=ln) def perform_request_delete(comID=-1, recID=-1, uid=-1, reviews="", ln=CFG_SITE_LANG): """ """ _ = gettext_set_language(ln) from invenio.search_engine import record_exists warnings = [] ln = wash_language(ln) comID = wash_url_argument(comID, 'int') recID = wash_url_argument(recID, 'int') uid = wash_url_argument(uid, 'int') # parameter reviews is deduced from comID when needed if comID is not 
None and recID is not None and uid is not None: if comID <= 0 and recID <= 0 and uid <= 0: if comID != -1: try: raise InvenioWebCommentWarning(_('Invalid comment ID.')) except InvenioWebCommentWarning, exc: register_exception(stream='warning') warnings.append((exc.message, '')) #warnings.append(("WRN_WEBCOMMENT_ADMIN_INVALID_COMID",)) return webcomment_templates.tmpl_admin_delete_form(ln, warnings) if comID > 0 and not recID > 0: comment = query_get_comment(comID) if comment: # Figure out if this is a review or a comment c_star_score = 5 if comment[c_star_score] > 0: reviews = 1 else: reviews = 0 return (perform_request_comments(ln=ln, comID=comID, recID=recID, reviews=reviews), None, warnings) else: try: raise InvenioWebCommentWarning(_('Comment ID %s does not exist.') % comID) except InvenioWebCommentWarning, exc: register_exception(stream='warning') warnings.append((exc.message, '')) #warnings.append(('WRN_WEBCOMMENT_ADMIN_COMID_INEXISTANT', comID)) return webcomment_templates.tmpl_admin_delete_form(ln, warnings) elif recID > 0: if record_exists(recID): comID = '' reviews = wash_url_argument(reviews, 'int') return (perform_request_comments(ln=ln, comID=comID, recID=recID, reviews=reviews), None, warnings) else: try: raise InvenioWebCommentWarning(_('Record ID %s does not exist.') % comID) except InvenioWebCommentWarning, exc: register_exception(stream='warning') warnings.append((exc.message, '')) #warnings.append(('WRN_WEBCOMMENT_ADMIN_RECID_INEXISTANT', comID)) return webcomment_templates.tmpl_admin_delete_form(ln, warnings) else: return webcomment_templates.tmpl_admin_delete_form(ln, warnings) else: return webcomment_templates.tmpl_admin_delete_form(ln, warnings) def perform_request_users(ln=CFG_SITE_LANG): """ """ ln = wash_language(ln) users_data = query_get_users_reported() return webcomment_templates.tmpl_admin_users(ln=ln, users_data=users_data) def query_get_users_reported(): """ Get the users who have been reported at least once. @return: tuple of ct, i.e. (ct, ct, ...) where ct is a tuple (total_number_reported, total_comments_reported, total_reviews_reported, total_nb_votes_yes_of_reported, total_nb_votes_total_of_reported, user_id, user_email, user_nickname) sorted by order of ct having highest total_number_reported """ query1 = "SELECT c.nb_abuse_reports, c.nb_votes_yes, c.nb_votes_total, u.id, u.email, u.nickname, c.star_score " \ "FROM user AS u, cmtRECORDCOMMENT AS c " \ "WHERE c.id_user=u.id AND c.nb_abuse_reports > 0 " \ "ORDER BY u.id " res1 = run_sql(query1) if type(res1) is None: return () users = {} for cmt in res1: uid = int(cmt[3]) if users.has_key(uid): users[uid] = (users[uid][0]+int(cmt[0]), int(cmt[6])>0 and users[uid][1] or users[uid][1]+1, int(cmt[6])>0 and users[uid][2]+1 or users[uid][2], users[uid][3]+int(cmt[1]), users[uid][4]+int(cmt[2]), int(cmt[3]), cmt[4], cmt[5]) else: users[uid] = (int(cmt[0]), int(cmt[6])==0 and 1 or 0, int(cmt[6])>0 and 1 or 0, int(cmt[1]), int(cmt[2]), int(cmt[3]), cmt[4], cmt[5]) users = users.values() users.sort() users.reverse() users = tuple(users) return users def perform_request_comments(req=None, ln=CFG_SITE_LANG, uid="", comID="", recID="", reviews=0, abuse=False, collection=""): """ Display the list of comments/reviews along with information about the comment. Display the comment given by its ID, or the list of comments for the given record ID. If abuse == True, only list records reported as abuse. 
If comID and recID are not provided, list all comments, or all abused comments (check parameter 'abuse') """ ln = wash_language(ln) uid = wash_url_argument(uid, 'int') comID = wash_url_argument(comID, 'int') recID = wash_url_argument(recID, 'int') reviews = wash_url_argument(reviews, 'int') collection = wash_url_argument(collection, 'str') user_info = collect_user_info(req) user_collections = ['Show all'] user_collections.extend(get_user_collections(req)) if collection and collection != 'Show all': (auth_code, auth_msg) = acc_authorize_action(req, 'moderatecomments', collection=collection) if auth_code: return webcomment_templates.tmpl_admin_comments(ln=ln, uid=uid, comID=comID, recID=recID, comment_data=None, reviews=reviews, error=1, user_collections=user_collections, collection=collection) if collection: if recID or uid: comments = query_get_comments(uid, comID, recID, reviews, ln, abuse=abuse, user_collections=user_collections, collection=collection) else: comments = query_get_comments('', comID, '', reviews, ln, abuse=abuse, user_collections=user_collections, collection=collection) else: if recID or uid: comments = query_get_comments(uid, comID, recID, reviews, ln, abuse=abuse, user_collections=user_collections, collection=user_collections[0]) else: comments = query_get_comments('', comID, '', reviews, ln, abuse=abuse, user_collections=user_collections, collection=user_collections[0]) if comments: return webcomment_templates.tmpl_admin_comments(ln=ln, uid=uid, comID=comID, recID=recID, comment_data=comments, reviews=reviews, error=0, user_collections=user_collections, collection=collection) else: return webcomment_templates.tmpl_admin_comments(ln=ln, uid=uid, comID=comID, recID=recID, comment_data=comments, reviews=reviews, error=2, user_collections=user_collections, collection=collection) def perform_request_hot(req=None, ln=CFG_SITE_LANG, comments=1, top=10, collection="Show all"): """ Display the list of hottest comments/reviews along with information about the comment. 
@param req: request object for obtaining user information @param ln: language @param comments: boolean activated if using comments, deactivated for reviews @param top: specify number of results to be shown @param collection: filter by collection """ ln = wash_language(ln) comments = wash_url_argument(comments, 'int') top = wash_url_argument(top, 'int') collection = wash_url_argument(collection, 'str') user_info = collect_user_info(req) user_collections = ['Show all'] user_collections.extend(get_user_collections(req)) if collection and collection != 'Show all': (auth_code, auth_msg) = acc_authorize_action(req, 'moderatecomments', collection=collection) if auth_code: return webcomment_templates.tmpl_admin_hot(ln=ln, comment_data = None, comments=comments, error=1, user_collections=user_collections, collection=collection) if collection: comments_retrieved = query_get_hot(comments, ln, top, user_collections, collection) else: comments_retrieved = query_get_hot(comments, ln, top, user_collections, user_collections[0]) if comments_retrieved: return webcomment_templates.tmpl_admin_hot(ln=ln, comment_data=comments_retrieved, comments=comments, error=0, user_collections=user_collections, collection=collection) else: return webcomment_templates.tmpl_admin_hot(ln=ln, comment_data=comments_retrieved, comments=comments, error=2, user_collections=user_collections, collection=collection) def perform_request_latest(req=None, ln=CFG_SITE_LANG, comments=1, top=10, collection=""): """ Display the list of latest comments/reviews along with information about the comment. @param req: request object for obtaining user information @param ln: language @param comments: boolean activated if using comments, deactivated for reviews @param top: Specify number of results to be shown @param collection: filter by collection """ ln = wash_language(ln) comments = wash_url_argument(comments, 'int') top = wash_url_argument(top, 'int') collection = wash_url_argument(collection, 'str') user_info = collect_user_info(req) user_collections = ['Show all'] user_collections.extend(get_user_collections(req)) if collection and collection != 'Show all': (auth_code, auth_msg) = acc_authorize_action(req, 'moderatecomments', collection=collection) if auth_code: return webcomment_templates.tmpl_admin_latest(ln=ln, comment_data=None, comments=comments, error=1, user_collections=user_collections, collection=collection) if collection: comments_retrieved = query_get_latest(comments, ln, top, user_collections, collection) else: comments_retrieved = query_get_latest(comments, ln, top, user_collections, user_collections[0]) if comments_retrieved: return webcomment_templates.tmpl_admin_latest(ln=ln, comment_data=comments_retrieved, comments=comments, error=0, user_collections=user_collections, collection=collection) else: return webcomment_templates.tmpl_admin_latest(ln=ln, comment_data=comments_retrieved, comments=comments, error=2, user_collections=user_collections, collection=collection) def perform_request_undel_single_com(ln=CFG_SITE_LANG, id=id): """ Mark comment referenced by id as active """ ln = wash_language(ln) id = wash_url_argument(id, 'int') return query_undel_single_comment(id) def query_get_comments(uid, cmtID, recID, reviews, ln, abuse=False, user_collections='', collection=''): """ private function @param user_collections: allowed collections for the user @param collection: collection to display @return tuple of comment where comment is tuple (nickname, uid, date_creation, body, id, status) if ranking disabled or tuple (nickname, 
uid, date_creation, body, nb_votes_yes, nb_votes_total, star_score, title, id, status) """ qdict = {'id': 0, 'id_bibrec': 1, 'uid': 2, 'date_creation': 3, 'body': 4, 'status': 5, 'nb_abuse_reports': 6, 'nb_votes_yes': 7, 'nb_votes_total': 8, 'star_score': 9, 'title': 10, 'email': -2, 'nickname': -1} query = """SELECT c.id, c.id_bibrec, c.id_user, DATE_FORMAT(c.date_creation, '%%Y-%%m-%%d %%H:%%i:%%S'), c.body, c.status, c.nb_abuse_reports, %s u.email, u.nickname FROM cmtRECORDCOMMENT c LEFT JOIN user u ON c.id_user = u.id %s ORDER BY c.nb_abuse_reports DESC, c.nb_votes_yes DESC, c.date_creation """ select_fields = reviews and 'c.nb_votes_yes, c.nb_votes_total, c.star_score, c.title,' or '' where_clause = "WHERE " + (reviews and 'c.star_score>0' or 'c.star_score=0') if uid: where_clause += ' AND c.id_user=%i' % uid if recID: where_clause += ' AND c.id_bibrec=%i' % recID if cmtID: where_clause += ' AND c.id=%i' % cmtID if abuse: where_clause += ' AND c.nb_abuse_reports>0' res = run_sql(query % (select_fields, where_clause)) collection_records = [] if collection == 'Show all': for collection_name in user_collections: collection_records.extend(perform_request_search(cc=collection_name)) else: collection_records.extend(perform_request_search(cc=collection)) output = [] for qtuple in res: if qtuple[qdict['id_bibrec']] in collection_records: nickname = qtuple[qdict['nickname']] or get_user_info(qtuple[qdict['uid']], ln)[2] if reviews: comment_tuple = (nickname, qtuple[qdict['uid']], qtuple[qdict['date_creation']], qtuple[qdict['body']], qtuple[qdict['nb_votes_yes']], qtuple[qdict['nb_votes_total']], qtuple[qdict['star_score']], qtuple[qdict['title']], qtuple[qdict['id']], qtuple[qdict['status']]) else: comment_tuple = (nickname, qtuple[qdict['uid']], qtuple[qdict['date_creation']], qtuple[qdict['body']], qtuple[qdict['id']], qtuple[qdict['status']]) general_infos_tuple = (nickname, qtuple[qdict['uid']], qtuple[qdict['email']], qtuple[qdict['id']], qtuple[qdict['id_bibrec']], qtuple[qdict['nb_abuse_reports']]) out_tuple = (comment_tuple, general_infos_tuple) output.append(out_tuple) return tuple(output) def query_get_hot(comments, ln, top, user_collections, collection): """ private function @param comments: boolean indicating if we want to retrieve comments or reviews @param ln: language @param top: number of results to display @param user_collections: allowed collections for the user @param collection: collection to display @return: tuple (id_bibrec, date_last_comment, users, count) """ qdict = {'id_bibrec': 0, 'date_last_comment': 1, 'users': 2, 'total_count': 3} query = """SELECT c.id_bibrec, DATE_FORMAT(max(c.date_creation), '%%Y-%%m-%%d %%H:%%i:%%S') as date_last_comment, count(distinct c.id_user) as users, count(*) as count FROM cmtRECORDCOMMENT c %s GROUP BY c.id_bibrec ORDER BY count(*) DESC LIMIT %s """ where_clause = "WHERE " + (comments and 'c.star_score=0' or 'c.star_score>0') + ' AND c.status="ok" AND c.nb_abuse_reports < %s' % CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN res = run_sql(query % (where_clause, top)) collection_records = [] if collection == 'Show all': for collection_name in user_collections: collection_records.extend(perform_request_search(cc=collection_name)) else: collection_records.extend(perform_request_search(cc=collection)) output = [] for qtuple in res: if qtuple[qdict['id_bibrec']] in collection_records: general_infos_tuple = (qtuple[qdict['id_bibrec']], qtuple[qdict['date_last_comment']], qtuple[qdict['users']], qtuple[qdict['total_count']]) 
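# Rows reach this point only when their id_bibrec appears in collection_records,
# i.e. in the set of records returned by perform_request_search() for the
# moderator's allowed collections, so the "hot" list silently covers only the
# records the current user may moderate.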
output.append(general_infos_tuple) return tuple(output) def query_get_latest(comments, ln, top, user_collections, collection): """ private function @param comments: boolean indicating if we want to retrieve comments or reviews @param ln: language @param top: number of results to display @param user_collections: allowed collections for the user @param collection: collection to display @return tuple of comment where comment is tuple (nickname, uid, date_creation, body, id) if latest comments or tuple (nickname, uid, date_creation, body, star_score, id) if latest reviews """ qdict = {'id': 0, 'id_bibrec': 1, 'uid': 2, 'date_creation': 3, 'body': 4, 'nb_abuse_reports': 5, 'star_score': 6, 'nickname': -1} query = """SELECT c.id, c.id_bibrec, c.id_user, DATE_FORMAT(c.date_creation, '%%Y-%%m-%%d %%H:%%i:%%S'), c.body, c.nb_abuse_reports, %s u.nickname FROM cmtRECORDCOMMENT c LEFT JOIN user u ON c.id_user = u.id %s ORDER BY c.date_creation DESC LIMIT %s """ select_fields = not comments and 'c.star_score, ' or '' where_clause = "WHERE " + (comments and 'c.star_score=0' or 'c.star_score>0') + ' AND c.status="ok" AND c.nb_abuse_reports < %s' % CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN res = run_sql(query % (select_fields, where_clause, top)) collection_records = [] if collection == 'Show all': for collection_name in user_collections: collection_records.extend(perform_request_search(cc=collection_name)) else: collection_records.extend(perform_request_search(cc=collection)) output = [] for qtuple in res: if qtuple[qdict['id_bibrec']] in collection_records: nickname = qtuple[qdict['nickname']] or get_user_info(qtuple[qdict['uid']], ln)[2] if not comments: comment_tuple = (nickname, qtuple[qdict['uid']], qtuple[qdict['date_creation']], qtuple[qdict['body']], qtuple[qdict['star_score']], qtuple[qdict['id']]) else: comment_tuple = (nickname, qtuple[qdict['uid']], qtuple[qdict['date_creation']], qtuple[qdict['body']], qtuple[qdict['id']]) general_infos_tuple = (nickname, qtuple[qdict['uid']], qtuple[qdict['id']], qtuple[qdict['id_bibrec']], qtuple[qdict['nb_abuse_reports']]) out_tuple = (comment_tuple, general_infos_tuple) output.append(out_tuple) return tuple(output) def perform_request_del_com(ln=CFG_SITE_LANG, comIDs=[]): """ private function Delete the comments and say whether successful or not @param ln: language @param comIDs: list of comment ids """ ln = wash_language(ln) comIDs = wash_url_argument(comIDs, 'list') # map ( fct, list, arguments of function) comIDs = map(wash_url_argument, comIDs, ('int '*len(comIDs)).split(' ')[:-1]) if not comIDs: comIDs = map(coerce, comIDs, ('0 '*len(comIDs)).split(' ')[:-1]) return webcomment_templates.tmpl_admin_del_com(del_res=comIDs, ln=ln) del_res = [] for comID in comIDs: del_res.append((comID, query_delete_comment_mod(comID))) return webcomment_templates.tmpl_admin_del_com(del_res=del_res, ln=ln) def perform_request_undel_com(ln=CFG_SITE_LANG, comIDs=[]): """ private function Undelete the comments and say whether successful or not @param ln: language @param comIDs: list of comment ids """ ln = wash_language(ln) comIDs = wash_url_argument(comIDs, 'list') # map ( fct, list, arguments of function) comIDs = map(wash_url_argument, comIDs, ('int '*len(comIDs)).split(' ')[:-1]) if not comIDs: comIDs = map(coerce, comIDs, ('0 '*len(comIDs)).split(' ')[:-1]) return webcomment_templates.tmpl_admin_undel_com(del_res=comIDs, ln=ln) del_res = [] for comID in comIDs: del_res.append((comID, query_undel_single_comment(comID))) return 
webcomment_templates.tmpl_admin_undel_com(del_res=del_res, ln=ln) def perform_request_del_single_com_mod(ln=CFG_SITE_LANG, id=id): """ private function Delete a single comment requested by a moderator @param ln: language @param id: comment id to be deleted """ ln = wash_language(ln) id = wash_url_argument(id, 'int') return query_delete_comment_mod(id) def perform_request_del_single_com_auth(ln=CFG_SITE_LANG, id=id): """ private function Delete a single comment requested by the author @param ln: language @param id: comment id to be deleted """ ln = wash_language(ln) id = wash_url_argument(id, 'int') return query_delete_comment_auth(id) def perform_request_unreport_single_com(ln=CFG_SITE_LANG, id=""): """ private function Unreport a single comment @param ln: language @param id: comment id to be deleted """ ln = wash_language(ln) id = wash_url_argument(id, 'int') return query_suppress_abuse_report(id) def suppress_abuse_report(ln=CFG_SITE_LANG, comIDs=[]): """ private function suppress the abuse reports for the given comIDs. @param ln: language @param comIDs: list of ids to suppress attached reports. """ ln = wash_language(ln) comIDs = wash_url_argument(comIDs, 'list') # map ( fct, list, arguments of function) comIDs = map(wash_url_argument, comIDs, ('int '*len(comIDs)).split(' ')[:-1]) if not comIDs: comIDs = map(coerce, comIDs, ('0 '*len(comIDs)).split(' ')[:-1]) return webcomment_templates.tmpl_admin_del_com(del_res=comIDs, ln=ln) del_res = [] for comID in comIDs: del_res.append((comID, query_suppress_abuse_report(comID))) return webcomment_templates.tmpl_admin_suppress_abuse_report(del_res=del_res, ln=ln) def query_suppress_abuse_report(comID): """ suppress abuse report for a given comment @return: integer 1 if successful, integer 0 if not """ query = "UPDATE cmtRECORDCOMMENT SET nb_abuse_reports=0, status='ap' WHERE id=%s" params = (comID,) res = run_sql(query, params) return int(res) def query_delete_comment_mod(comID): """ delete comment with id comID @return: integer 1 if successful, integer 0 if not """ query1 = "UPDATE cmtRECORDCOMMENT SET status='dm' WHERE id=%s" params1 = (comID,) res1 = run_sql(query1, params1) return int(res1) def query_delete_comment_auth(comID): """ delete comment with id comID @return: integer 1 if successful, integer 0 if not """ query1 = "UPDATE cmtRECORDCOMMENT SET status='da' WHERE id=%s" params1 = (comID,) res1 = run_sql(query1, params1) return int(res1) def query_undel_single_comment(comID): """ undelete comment with id comID @return: integer 1 if successful, integer 0 if not """ query = "UPDATE cmtRECORDCOMMENT SET status='ok' WHERE id=%s" params = (comID,) res = run_sql(query, params) return int(res) def check_user_is_author(user_id, com_id): """ Check if the user is the author of the given comment """ res = run_sql("SELECT id, id_user FROM cmtRECORDCOMMENT WHERE id=%s and id_user=%s", (str(com_id), str(user_id))) if res: return 1 return 0 def migrate_comments_populate_threads_index(): """ Fill in the `reply_order_cached_data' columns in cmtRECORDCOMMENT and bskRECORDCOMMENT tables with adequate values so that thread are displayed correctly. 
""" # Update WebComment comments res = run_sql("SELECT id FROM cmtRECORDCOMMENT WHERE reply_order_cached_data is NULL") for row in res: reply_order_cached_data = get_reply_order_cache_data(row[0]) - run_sql("UPDATE cmtRECORDCOMMENT set reply_order_cached_data=%s WHERE id=%s", + run_sql("UPDATE cmtRECORDCOMMENT set reply_order_cached_data=_binary %s WHERE id=%s", (reply_order_cached_data, row[0])) # Update WebBasket comments res = run_sql("SELECT id FROM bskRECORDCOMMENT WHERE reply_order_cached_data is NULL") for row in res: reply_order_cached_data = get_reply_order_cache_data(row[0]) - run_sql("UPDATE cmtRECORDCOMMENT set reply_order_cached_data=%s WHERE id=%s", + run_sql("UPDATE cmtRECORDCOMMENT set reply_order_cached_data=_binary %s WHERE id=%s", (reply_order_cached_data, row[0])) diff --git a/modules/websearch/lib/websearch_webcoll.py b/modules/websearch/lib/websearch_webcoll.py index 16662eb90..259a3d7b9 100644 --- a/modules/websearch/lib/websearch_webcoll.py +++ b/modules/websearch/lib/websearch_webcoll.py @@ -1,1224 +1,1224 @@ # This file is part of Invenio. -# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 CERN. +# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
"""Create Invenio collection cache.""" __revision__ = "$Id$" import calendar import copy import sys import cgi import re import os import string import time import cPickle from invenio.config import \ CFG_CERN_SITE, \ CFG_WEBSEARCH_INSTANT_BROWSE, \ CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, \ CFG_WEBSEARCH_I18N_LATEST_ADDITIONS, \ CFG_CACHEDIR, \ CFG_SITE_LANG, \ CFG_SITE_NAME, \ CFG_SITE_LANGS, \ CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, \ CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS, \ CFG_SCOAP3_SITE from invenio.messages import gettext_set_language from invenio.search_engine import search_pattern_parenthesised, get_creation_date, get_field_i18nname, collection_restricted_p, sort_records, EM_REPOSITORY from invenio.search_engine_config import CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES from invenio.dbquery import run_sql, Error, get_table_update_time from invenio.bibrank_record_sorter import get_bibrank_methods from invenio.dateutils import convert_datestruct_to_dategui, strftime from invenio.bibformat import format_record from invenio.shellutils import mymkdir from invenio.intbitset import intbitset from invenio.websearch_external_collections import \ external_collection_load_states, \ dico_collection_external_searches, \ external_collection_sort_engine_by_name from invenio.bibtask import task_init, task_get_option, task_set_option, \ write_message, task_has_option, task_update_progress, \ task_sleep_now_if_required import invenio.template websearch_templates = invenio.template.load('websearch') from invenio.websearch_external_collections_searcher import external_collections_dictionary from invenio.websearch_external_collections_config import CFG_EXTERNAL_COLLECTION_TIMEOUT from invenio.websearch_external_collections_config import CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS # global vars COLLECTION_HOUSE = {} # will hold collections we treat in this run of the program; a dict of {collname2, collobject1}, ... # CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE -- cache timestamp # tolerance (in seconds), to account for the fact that an admin might # accidentally happen to edit the collection definitions at exactly # the same second when some webcoll process was about to be started. # In order to be safe, let's put an exaggerated timestamp tolerance # value such as 20 seconds: CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE = 20 # CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE -- location of the cache # timestamp file: CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE = "%s/collections/last_updated" % CFG_CACHEDIR # CFG_CACHE_LAST_FAST_UPDATED_TIMESTAMP_FILE -- location of the cache # timestamp file usef when running webcoll in the fast-mode. CFG_CACHE_LAST_FAST_UPDATED_TIMESTAMP_FILE = "%s/collections/last_fast_updated" % CFG_CACHEDIR def get_collection(colname): """Return collection object from the collection house for given colname. If does not exist, then create it.""" if not COLLECTION_HOUSE.has_key(colname): colobject = Collection(colname) COLLECTION_HOUSE[colname] = colobject return COLLECTION_HOUSE[colname] # auxiliary functions: def is_selected(var, fld): "Checks if the two are equal, and if yes, returns ' selected'. Useful for select boxes." if var == fld: return ' selected="selected"' else: return "" def get_field(recID, tag): "Gets list of field 'tag' for the record with 'recID' system number." 
out = [] digit = tag[0:2] bx = "bib%sx" % digit bibx = "bibrec_bib%sx" % digit query = "SELECT bx.value FROM %s AS bx, %s AS bibx WHERE bibx.id_bibrec='%s' AND bx.id=bibx.id_bibxxx AND bx.tag='%s'" \ % (bx, bibx, recID, tag) res = run_sql(query) for row in res: out.append(row[0]) return out def check_nbrecs_for_all_external_collections(): """Check if any of the external collections have changed their total number of records, aka nbrecs. Return True if any of the total numbers of records have changed and False if they're all the same.""" res = run_sql("SELECT name FROM collection WHERE dbquery LIKE 'hostedcollection:%';") for row in res: coll_name = row[0] if (get_collection(coll_name)).check_nbrecs_for_external_collection(): return True return False class Collection: "Holds the information on collections (id,name,dbquery)." def __init__(self, name=""): "Creates collection instance by querying the DB configuration database about 'name'." self.calculate_reclist_run_already = 0 # to speed things up without much refactoring self.update_reclist_run_already = 0 # to speed things up without much refactoring self.reclist_updated_since_start = 0 # to check if webpage cache need rebuilding self.reclist_with_nonpublic_subcolls = intbitset() # temporary counters for the number of records in hosted collections self.nbrecs_tmp = None # number of records in a hosted collection self.nbrecs_from_hosted_collections = 0 # total number of records from # descendant hosted collections if not name: self.name = CFG_SITE_NAME # by default we are working on the home page self.id = 1 self.dbquery = None self.nbrecs = None self.reclist = intbitset() self.old_reclist = intbitset() self.reclist_updated_since_start = 1 else: self.name = name try: res = run_sql("""SELECT id,name,dbquery,nbrecs,reclist FROM collection WHERE name=%s""", (name,)) if res: self.id = res[0][0] self.name = res[0][1] self.dbquery = res[0][2] self.nbrecs = res[0][3] try: self.reclist = intbitset(res[0][4]) except: self.reclist = intbitset() self.reclist_updated_since_start = 1 else: # collection does not exist! self.id = None self.dbquery = None self.nbrecs = None self.reclist = intbitset() self.reclist_updated_since_start = 1 self.old_reclist = intbitset(self.reclist) except Error, e: print "Error %d: %s" % (e.args[0], e.args[1]) sys.exit(1) def get_example_search_queries(self): """Returns list of sample search queries for this collection. """ res = run_sql("""SELECT example.body FROM example LEFT JOIN collection_example on example.id=collection_example.id_example WHERE collection_example.id_collection=%s ORDER BY collection_example.score""", (self.id,)) return [query[0] for query in res] def get_name(self, ln=CFG_SITE_LANG, name_type="ln", prolog="", epilog="", prolog_suffix=" ", epilog_suffix=""): """Return nicely formatted collection name for language LN. The NAME_TYPE may be 'ln' (=long name), 'sn' (=short name), etc.""" out = prolog i18name = "" res = run_sql("SELECT value FROM collectionname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, ln, name_type)) try: i18name += res[0][0] except IndexError: pass if i18name: out += i18name else: out += self.name out += epilog return out def get_collectionbox_name(self, ln=CFG_SITE_LANG, box_type="r"): """ Return collection-specific labelling of 'Focus on' (regular collection), 'Narrow by' (virtual collection) and 'Latest addition' boxes. If translation for given language does not exist, use label for CFG_SITE_LANG. 
If no custom label is defined for CFG_SITE_LANG, return default label for the box. @param ln: the language of the label @param box_type: can be 'r' (=Narrow by), 'v' (=Focus on), 'l' (=Latest additions) """ i18name = "" res = run_sql("SELECT value FROM collectionboxname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, ln, box_type)) try: i18name = res[0][0] except IndexError: res = run_sql("SELECT value FROM collectionboxname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, CFG_SITE_LANG, box_type)) try: i18name = res[0][0] except IndexError: pass if not i18name: # load the right message language _ = gettext_set_language(ln) if box_type == "v": i18name = _('Focus on:') elif box_type == "r": if CFG_SCOAP3_SITE: i18name = _('Narrow by publisher/journal:') else: i18name = _('Narrow by collection:') elif box_type == "l": i18name = _('Latest additions:') return i18name def get_ancestors(self): "Returns list of ancestors of the current collection." ancestors = [] ancestors_ids = intbitset() id_son = self.id while 1: query = "SELECT cc.id_dad,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_son=%d AND c.id=cc.id_dad" % int(id_son) res = run_sql(query, None, 1) if res: col_ancestor = get_collection(res[0][1]) # looking for loops if self.id in ancestors_ids: write_message("Loop found in collection %s" % self.name, stream=sys.stderr) raise OverflowError("Loop found in collection %s" % self.name) else: ancestors.append(col_ancestor) ancestors_ids.add(col_ancestor.id) id_son = res[0][0] else: break ancestors.reverse() return ancestors def restricted_p(self): """Predicate to test if the collection is restricted or not. Return the contect of the `restrited' column of the collection table (typically Apache group). Otherwise return None if the collection is public.""" if collection_restricted_p(self.name): return 1 return None def get_sons(self, type='r'): "Returns list of direct sons of type 'type' for the current collection." sons = [] id_dad = self.id query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score ASC, c.name ASC" % (int(id_dad), type) res = run_sql(query) for row in res: sons.append(get_collection(row[1])) return sons def get_descendants(self, type='r'): "Returns list of all descendants of type 'type' for the current collection." descendants = [] descendant_ids = intbitset() id_dad = self.id query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score ASC" % (int(id_dad), type) res = run_sql(query) for row in res: col_desc = get_collection(row[1]) # looking for loops if self.id in descendant_ids: write_message("Loop found in collection %s" % self.name, stream=sys.stderr) raise OverflowError("Loop found in collection %s" % self.name) else: descendants.append(col_desc) descendant_ids.add(col_desc.id) tmp_descendants = col_desc.get_descendants() for descendant in tmp_descendants: descendant_ids.add(descendant.id) descendants += tmp_descendants return descendants def write_cache_file(self, filename='', filebody={}): "Write a file inside collection cache." 
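# What the body below does, in short: serialise `filebody` (a dict of
# pre-rendered page fragments) with cPickle into
# CFG_CACHEDIR/collections/<filename>.html, replacing any '/' in the name with
# '___SLASH___' so collection names containing slashes stay on one path
# segment; the counterpart read is a cPickle.load() on the same path.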
# open file: dirname = "%s/collections" % (CFG_CACHEDIR) mymkdir(dirname) fullfilename = dirname + "/%s.html" % filename.replace('/', '___SLASH___') try: os.umask(022) f = open(fullfilename, "wb") except IOError, v: try: (code, message) = v except: code = 0 message = v print "I/O Error: " + str(message) + " (" + str(code) + ")" sys.exit(1) # print user info: write_message("... creating %s" % fullfilename, verbose=6) # print page body: cPickle.dump(filebody, f, cPickle.HIGHEST_PROTOCOL) # close file: f.close() def update_webpage_cache(self, lang): """Create collection page header, navtrail, body (including left and right stripes) and footer, and call write_cache_file() afterwards to update the collection webpage cache.""" ## precalculate latest additions for non-aggregate ## collections (the info is ln and as independent) if self.dbquery: if CFG_WEBSEARCH_I18N_LATEST_ADDITIONS: self.create_latest_additions_info(ln=lang) else: self.create_latest_additions_info() # load the right message language _ = gettext_set_language(lang) # create dictionary with data cache = {"te_portalbox" : self.create_portalbox(lang, 'te'), "np_portalbox" : self.create_portalbox(lang, 'np'), "ne_portalbox" : self.create_portalbox(lang, 'ne'), "tp_portalbox" : self.create_portalbox(lang, "tp"), "lt_portalbox" : self.create_portalbox(lang, "lt"), "rt_portalbox" : self.create_portalbox(lang, "rt"), "last_updated" : convert_datestruct_to_dategui(time.localtime(), ln=lang)} for aas in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES: # do light, simple and advanced search pages: cache["navtrail_%s" % aas] = self.create_navtrail_links(aas, lang) cache["searchfor_%s" % aas] = self.create_searchfor(aas, lang) cache["narrowsearch_%s" % aas] = self.create_narrowsearch(aas, lang, 'r') cache["focuson_%s" % aas] = self.create_narrowsearch(aas, lang, "v")+ \ self.create_external_collections_box(lang) cache["instantbrowse_%s" % aas] = self.create_instant_browse(aas=aas, ln=lang) # write cache file self.write_cache_file("%s-ln=%s"%(self.name, lang), cache) return cache def create_navtrail_links(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): """Creates navigation trail links, i.e. links to collection ancestors (except Home collection). If aas==1, then links to Advanced Search interfaces; otherwise Simple Search. """ dads = [] for dad in self.get_ancestors(): if dad.name != CFG_SITE_NAME: # exclude Home collection dads.append((dad.name, dad.get_name(ln))) return websearch_templates.tmpl_navtrail_links( aas=aas, ln=ln, dads=dads) def create_portalbox(self, lang=CFG_SITE_LANG, position="rt"): """Creates portalboxes of language CFG_SITE_LANG of the position POSITION by consulting DB configuration database. The position may be: 'lt'='left top', 'rt'='right top', etc.""" out = "" query = "SELECT p.title,p.body FROM portalbox AS p, collection_portalbox AS cp "\ " WHERE cp.id_collection=%d AND p.id=cp.id_portalbox AND cp.ln='%s' AND cp.position='%s' "\ " ORDER BY cp.score DESC" % (self.id, lang, position) res = run_sql(query) for row in res: title, body = row[0], row[1] if title: out += websearch_templates.tmpl_portalbox(title = title, body = body) else: # no title specified, so print body ``as is'' only: out += body return out def create_narrowsearch(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG, type="r"): """Creates list of collection descendants of type 'type' under title 'title'. If aas==1, then links to Advanced Search interfaces; otherwise Simple Search. 
Suitable for 'Narrow search' and 'Focus on' boxes.""" # get list of sons and analyse it sons = self.get_sons(type) if not sons: return '' # get descendents descendants = self.get_descendants(type) grandsons = [] if CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS: # load grandsons for each son for son in sons: grandsons.append(son.get_sons()) # return "" return websearch_templates.tmpl_narrowsearch( aas = aas, ln = ln, type = type, father = self, has_grandchildren = len(descendants)>len(sons), sons = sons, display_grandsons = CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, grandsons = grandsons ) def create_external_collections_box(self, ln=CFG_SITE_LANG): external_collection_load_states() if not dico_collection_external_searches.has_key(self.id): return "" engines_list = external_collection_sort_engine_by_name(dico_collection_external_searches[self.id]) return websearch_templates.tmpl_searchalso(ln, engines_list, self.id) def create_latest_additions_info(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, ln=CFG_SITE_LANG): """ Create info about latest additions that will be used for create_instant_browse() later. """ self.latest_additions_info = [] if self.nbrecs and self.reclist: # firstly, get last 'rg' records: recIDs = list(self.reclist) of = 'hb' # CERN hack begins: tweak latest additions for selected collections: if CFG_CERN_SITE: # alter recIDs list for some CERN collections: this_year = time.strftime("%Y", time.localtime()) if self.name in ['CERN Yellow Reports','Videos']: last_year = str(int(this_year) - 1) # detect recIDs only from this and past year: recIDs = list(self.reclist & \ search_pattern_parenthesised(p='year:%s or year:%s' % \ (this_year, last_year))) # apply special filters: if self.name in ['Videos']: # select only videos with movies: recIDs = list(intbitset(recIDs) & \ search_pattern_parenthesised(p='collection:"PUBLVIDEOMOVIE" -"Virtual Visit"')) of = 'hvp' if self.name in ['General Talks', 'Academic Training Lectures', 'Summer Student Lectures']: #select only the lectures with material recIDs = list(self.reclist & search_pattern_parenthesised(p='856:MediaArchive')) # sort some CERN collections specially: if self.name in ['Videos', 'Video Clips', 'Video Movies', 'Video News', 'Video Rushes', 'Webcast', 'ATLAS Videos', 'Restricted Video Movies', 'Restricted Video Rushes', 'LHC First Beam Videos', 'CERN openlab Videos']: recIDs = sort_records(None, recIDs, '269__c', 'a') elif self.name in ['LHCb Talks']: recIDs = sort_records(None, recIDs, 'reportnumber', 'a') elif self.name in ['CERN Yellow Reports']: recIDs = sort_records(None, recIDs, '084__a', 'a') elif self.name in ['CERN Courier Issues', 'CERN Courier Articles', 'CERN Bulletin Issues', 'CERN Bulletin Articles']: recIDs = sort_records(None, recIDs, '773__y', 'a') # CERN hack ends. total = len(recIDs) to_display = min(rg, total) for idx in range(total-1, total-to_display-1, -1): recid = recIDs[idx] self.latest_additions_info.append({'id': recid, 'format': format_record(recid, of, ln=ln), 'date': get_creation_date(recid, fmt="%Y-%m-%d
%H:%i")}) return def create_instant_browse(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): "Searches database and produces list of last 'rg' records." if self.restricted_p(): return websearch_templates.tmpl_box_restricted_content(ln = ln) if str(self.dbquery).startswith("hostedcollection:"): return websearch_templates.tmpl_box_hosted_collection(ln = ln) if rg == 0: # do not show latest additions box return "" # CERN hack: do not display latest additions for some CERN collections: if CFG_CERN_SITE and self.name in ['Periodicals', 'Electronic Journals', 'Press Office Photo Selection', 'Press Office Video Selection']: return "" try: self.latest_additions_info latest_additions_info_p = True except: latest_additions_info_p = False if latest_additions_info_p: passIDs = [] for idx in range(0, min(len(self.latest_additions_info), rg)): # CERN hack: display the records in a grid layout, so do not show the related links if CFG_CERN_SITE and self.name in ['Videos']: passIDs.append({'id': self.latest_additions_info[idx]['id'], 'body': self.latest_additions_info[idx]['format'], 'date': self.latest_additions_info[idx]['date']}) else: passIDs.append({'id': self.latest_additions_info[idx]['id'], 'body': self.latest_additions_info[idx]['format'] + \ websearch_templates.tmpl_record_links(recid=self.latest_additions_info[idx]['id'], rm='citation', ln=ln), 'date': self.latest_additions_info[idx]['date']}) if self.nbrecs > rg: url = websearch_templates.build_search_url( cc=self.name, jrec=rg+1, ln=ln, aas=aas) else: url = "" # CERN hack: display the records in a grid layout if CFG_CERN_SITE and self.name in ['Videos']: return websearch_templates.tmpl_instant_browse( aas=aas, ln=ln, recids=passIDs, more_link=url, grid_layout=True, father=self) return websearch_templates.tmpl_instant_browse( aas=aas, ln=ln, recids=passIDs, more_link=url, father=self) return websearch_templates.tmpl_box_no_records(ln=ln) def create_searchoptions(self): "Produces 'Search options' portal box." 
box = "" query = """SELECT DISTINCT(cff.id_field),f.code,f.name FROM collection_field_fieldvalue AS cff, field AS f WHERE cff.id_collection=%d AND cff.id_fieldvalue IS NOT NULL AND cff.id_field=f.id ORDER BY cff.score DESC""" % self.id res = run_sql(query) if res: for row in res: field_id = row[0] field_code = row[1] field_name = row[2] query_bis = """SELECT fv.value,fv.name FROM fieldvalue AS fv, collection_field_fieldvalue AS cff WHERE cff.id_collection=%d AND cff.type='seo' AND cff.id_field=%d AND fv.id=cff.id_fieldvalue ORDER BY cff.score_fieldvalue DESC, cff.score DESC, fv.name ASC""" % (self.id, field_id) res_bis = run_sql(query_bis) if res_bis: values = [{'value' : '', 'text' : 'any' + ' ' + field_name}] # FIXME: internationalisation of "any" for row_bis in res_bis: values.append({'value' : cgi.escape(row_bis[0], 1), 'text' : row_bis[1]}) box += websearch_templates.tmpl_select( fieldname = field_code, values = values ) return box def create_sortoptions(self, ln=CFG_SITE_LANG): """Produces 'Sort options' portal box.""" # load the right message language _ = gettext_set_language(ln) box = "" query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE id_collection=%d AND cff.type='soo' AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""" % self.id values = [{'value' : '', 'text': "- %s -" % _("latest first")}] res = run_sql(query) if res: for row in res: values.append({'value' : row[0], 'text': get_field_i18nname(row[1], ln)}) else: for tmp in ('title', 'author', 'report number', 'year'): values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) box = websearch_templates.tmpl_select( fieldname = 'sf', css_class = 'address', values = values ) box += websearch_templates.tmpl_select( fieldname = 'so', css_class = 'address', values = [ {'value' : 'a' , 'text' : _("asc.")}, {'value' : 'd' , 'text' : _("desc.")} ] ) return box def create_rankoptions(self, ln=CFG_SITE_LANG): "Produces 'Rank options' portal box." # load the right message language _ = gettext_set_language(ln) values = [{'value' : '', 'text': "- %s %s -" % (string.lower(_("OR")), _("rank by"))}] for (code, name) in get_bibrank_methods(self.id, ln): values.append({'value' : code, 'text': name}) box = websearch_templates.tmpl_select( fieldname = 'rm', css_class = 'address', values = values ) return box def create_displayoptions(self, ln=CFG_SITE_LANG): "Produces 'Display options' portal box." # load the right message language _ = gettext_set_language(ln) values = [] for i in ['10', '25', '50', '100', '250', '500']: values.append({'value' : i, 'text' : i + ' ' + _("results")}) box = websearch_templates.tmpl_select( fieldname = 'rg', selected = str(CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS), css_class = 'address', values = values ) if self.get_sons(): box += websearch_templates.tmpl_select( fieldname = 'sc', css_class = 'address', values = [ {'value' : '1' , 'text' : CFG_SCOAP3_SITE and _("split by publisher/journal") or _("split by collection")}, {'value' : '0' , 'text' : _("single list")} ] ) return box def create_formatoptions(self, ln=CFG_SITE_LANG): "Produces 'Output format options' portal box." 
# load the right message language _ = gettext_set_language(ln) box = "" values = [] query = """SELECT f.code,f.name FROM format AS f, collection_format AS cf WHERE cf.id_collection=%d AND cf.id_format=f.id AND f.visibility='1' ORDER BY cf.score DESC, f.name ASC""" % self.id res = run_sql(query) if res: for row in res: values.append({'value' : row[0], 'text': row[1]}) else: values.append({'value' : 'hb', 'text' : "HTML %s" % _("brief")}) box = websearch_templates.tmpl_select( fieldname = 'of', css_class = 'address', values = values ) return box def create_searchwithin_selection_box(self, fieldname='f', value='', ln='en'): """Produces 'search within' selection box for the current collection.""" # get values query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='sew' AND cff.id_collection=%d AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""" % self.id res = run_sql(query) values = [{'value' : '', 'text' : get_field_i18nname("any field", ln)}] if res: for row in res: values.append({'value' : row[0], 'text' : get_field_i18nname(row[1], ln)}) else: if CFG_CERN_SITE: for tmp in ['title', 'author', 'abstract', 'report number', 'year']: values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) else: for tmp in ['title', 'author', 'abstract', 'keyword', 'report number', 'journal', 'year', 'fulltext', 'reference']: values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) return websearch_templates.tmpl_searchwithin_select( fieldname = fieldname, ln = ln, selected = value, values = values ) def create_searchexample(self): "Produces search example(s) for the current collection." out = "$collSearchExamples = getSearchExample(%d, $se);" % self.id return out def create_searchfor(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): "Produces either Simple or Advanced 'Search for' box for the current collection." if aas == 2: return self.create_searchfor_addtosearch(ln) elif aas == 1: return self.create_searchfor_advanced(ln) elif aas == 0: return self.create_searchfor_simple(ln) else: return self.create_searchfor_light(ln) def create_searchfor_addtosearch(self, ln=CFG_SITE_LANG): "Produces add-to-search 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_addtosearch( ln=ln, collection_id=self.name, record_count=self.nbrecs, searchwithin= self.create_searchwithin_selection_box(fieldname='f1', ln=ln), ) def create_searchfor_light(self, ln=CFG_SITE_LANG): "Produces light 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_light( ln=ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, example_search_queries=self.get_example_search_queries(), ) def create_searchfor_simple(self, ln=CFG_SITE_LANG): "Produces simple 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_simple( ln=ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, middle_option = self.create_searchwithin_selection_box(ln=ln), ) def create_searchfor_advanced(self, ln=CFG_SITE_LANG): "Produces advanced 'Search for' box for the current collection." 
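For reference, create_searchfor() above dispatches on the `aas` interface code. The mapping is small enough to restate at a glance (a sketch only; the method itself uses the if/elif chain shown above):

# aas: 2 = add-to-search, 1 = advanced, 0 = simple, anything else = light
AAS_SEARCHFOR_BUILDERS = {
    2: 'create_searchfor_addtosearch',
    1: 'create_searchfor_advanced',
    0: 'create_searchfor_simple',
}

def searchfor_method_name(aas):
    """Return the name of the 'Search for' builder used for a given aas code."""
    return AAS_SEARCHFOR_BUILDERS.get(aas, 'create_searchfor_light')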
return websearch_templates.tmpl_searchfor_advanced( ln = ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, middle_option_1 = self.create_searchwithin_selection_box('f1', ln=ln), middle_option_2 = self.create_searchwithin_selection_box('f2', ln=ln), middle_option_3 = self.create_searchwithin_selection_box('f3', ln=ln), searchoptions = self.create_searchoptions(), sortoptions = self.create_sortoptions(ln), rankoptions = self.create_rankoptions(ln), displayoptions = self.create_displayoptions(ln), formatoptions = self.create_formatoptions(ln) ) def calculate_reclist(self): """ Calculate, set and return the (reclist, reclist_with_nonpublic_subcolls, nbrecs_from_hosted_collections) tuple for the given collection.""" if str(self.dbquery).startswith("hostedcollection:"): # we don't normally use this function to calculate the reclist # for hosted collections. In case we do, recursively for a regular # ancestor collection, then quickly return the object attributes. return (self.reclist, self.reclist_with_nonpublic_subcolls, self.nbrecs) if self.calculate_reclist_run_already: # do we really have to recalculate? If not, # then return the object attributes return (self.reclist, self.reclist_with_nonpublic_subcolls, self.nbrecs_from_hosted_collections) write_message("... calculating reclist of %s" % self.name, verbose=6) reclist = intbitset() # will hold results for public sons only; good for storing into DB reclist_with_nonpublic_subcolls = intbitset() # will hold results for both public and nonpublic sons; good for deducing total # number of documents nbrecs_from_hosted_collections = 0 # will hold the total number of records from descendant hosted collections if not self.dbquery: # A - collection does not have dbquery, so query recursively all its sons # that are either non-restricted or that have the same restriction rules for coll in self.get_sons(): coll_reclist,\ coll_reclist_with_nonpublic_subcolls,\ coll_nbrecs_from_hosted_collection = coll.calculate_reclist() if ((coll.restricted_p() is None) or (coll.restricted_p() == self.restricted_p())): # add this reclist ``for real'' only if it is public reclist.union_update(coll_reclist) reclist_with_nonpublic_subcolls.union_update(coll_reclist_with_nonpublic_subcolls) # increment the total number of records from descendant hosted collections nbrecs_from_hosted_collections += coll_nbrecs_from_hosted_collection else: # B - collection does have dbquery, so compute it: # (note: explicitly remove DELETED records) if CFG_CERN_SITE: reclist = search_pattern_parenthesised(None, self.dbquery + \ ' -980__:"DELETED" -980__:"DUMMY"', ap=-9) #ap=-9 for allow queries containing hidden tags else: reclist = search_pattern_parenthesised(None, self.dbquery + ' -980__:"DELETED"', ap=-9) #ap=-9 allow queries containing hidden tags reclist_with_nonpublic_subcolls = copy.deepcopy(reclist) # store the results: self.nbrecs_from_hosted_collections = nbrecs_from_hosted_collections self.nbrecs = len(reclist_with_nonpublic_subcolls) + \ nbrecs_from_hosted_collections self.reclist = reclist self.reclist_with_nonpublic_subcolls = reclist_with_nonpublic_subcolls # last but not least, update the speed-up flag: self.calculate_reclist_run_already = 1 # return the two sets, as well as # the total number of records from descendant hosted collections: return (self.reclist, self.reclist_with_nonpublic_subcolls, self.nbrecs_from_hosted_collections) def calculate_nbrecs_for_external_collection(self, timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT): 
"""Calculate the total number of records, aka nbrecs, for given external collection.""" #if self.calculate_reclist_run_already: # do we have to recalculate? #return self.nbrecs #write_message("... calculating nbrecs of external collection %s" % self.name, verbose=6) if external_collections_dictionary.has_key(self.name): engine = external_collections_dictionary[self.name] if engine.parser: self.nbrecs_tmp = engine.parser.parse_nbrecs(timeout) if self.nbrecs_tmp >= 0: return self.nbrecs_tmp # the parse_nbrecs() function returns negative values for some specific cases # maybe we can handle these specific cases, some warnings or something # for now the total number of records remains silently the same else: return self.nbrecs else: write_message("External collection %s does not have a parser!" % self.name, verbose=6) else: write_message("External collection %s not found!" % self.name, verbose=6) return 0 # last but not least, update the speed-up flag: #self.calculate_reclist_run_already = 1 def check_nbrecs_for_external_collection(self): """Check if the external collections has changed its total number of records, aka nbrecs. Rerurns True if the total number of records has changed and False if it's the same""" write_message("*** self.nbrecs = %s / self.cal...ion = %s ***" % (str(self.nbrecs), str(self.calculate_nbrecs_for_external_collection())), verbose=6) write_message("*** self.nbrecs != self.cal...ion = %s ***" % (str(self.nbrecs != self.calculate_nbrecs_for_external_collection()),), verbose=6) return self.nbrecs != self.calculate_nbrecs_for_external_collection(CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS) def set_nbrecs_for_external_collection(self): """Set this external collection's total number of records, aka nbrecs""" if self.calculate_reclist_run_already: # do we have to recalculate? return write_message("... calculating nbrecs of external collection %s" % self.name, verbose=6) if self.nbrecs_tmp: self.nbrecs = self.nbrecs_tmp else: self.nbrecs = self.calculate_nbrecs_for_external_collection(CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS) # last but not least, update the speed-up flag: self.calculate_reclist_run_already = 1 def update_reclist(self): "Update the record universe for given collection; nbrecs, reclist of the collection table." if self.update_reclist_run_already: # do we have to reupdate? return 0 write_message("... updating reclist of %s (%s recs)" % (self.name, self.nbrecs), verbose=6) sys.stdout.flush() try: ## In principle we could skip this update if old_reclist==reclist ## however we just update it here in case of race-conditions. - run_sql("UPDATE collection SET nbrecs=%s, reclist=%s WHERE id=%s", + run_sql("UPDATE collection SET nbrecs=%s, reclist=_binary %s WHERE id=%s", (self.nbrecs, self.reclist.fastdump(), self.id)) if self.old_reclist != self.reclist: self.reclist_updated_since_start = 1 else: write_message("... no changes in reclist detected", verbose=6) except Error, e: print "Database Query Error %d: %s." 
% (e.args[0], e.args[1]) sys.exit(1) # last but not least, update the speed-up flag: self.update_reclist_run_already = 1 return 0 def perform_display_collection(colID, colname, aas, ln, em, show_help_boxes): """Returns the data needed to display a collection page The arguments are as follows: colID - id of the collection to display colname - name of the collection to display aas - 0 if simple search, 1 if advanced search ln - language of the page em - code to display just part of the page show_help_boxes - whether to show the help boxes or not""" # check and update cache if necessary cachedfile = open(r"%s/collections/%s-ln=%s.html" % (CFG_CACHEDIR, colname.replace('/', '___SLASH___'), ln), "rb") try: data = cPickle.load(cachedfile) except ValueError: data = get_collection(colname).update_webpage_cache(ln) cachedfile.close() # check em value to return just part of the page if em != "": if EM_REPOSITORY["search_box"] not in em: data["searchfor_%s" % aas] = "" if EM_REPOSITORY["see_also_box"] not in em: data["focuson_%s" % aas] = "" if EM_REPOSITORY["all_portalboxes"] not in em: if EM_REPOSITORY["te_portalbox"] not in em: data["te_portalbox"] = "" if EM_REPOSITORY["np_portalbox"] not in em: data["np_portalbox"] = "" if EM_REPOSITORY["ne_portalbox"] not in em: data["ne_portalbox"] = "" if EM_REPOSITORY["tp_portalbox"] not in em: data["tp_portalbox"] = "" if EM_REPOSITORY["lt_portalbox"] not in em: data["lt_portalbox"] = "" if EM_REPOSITORY["rt_portalbox"] not in em: data["rt_portalbox"] = "" c_body = websearch_templates.tmpl_webcoll_body(ln, colID, data["te_portalbox"], data["searchfor_%s"%aas], data["np_portalbox"], data["narrowsearch_%s"%aas], data["focuson_%s"%aas], data["instantbrowse_%s"%aas], data["ne_portalbox"], em=="" or EM_REPOSITORY["body"] in em) if show_help_boxes <= 0: data["rt_portalbox"] = "" return (c_body, data["navtrail_%s"%aas], data["lt_portalbox"], data["rt_portalbox"], data["tp_portalbox"], data["te_portalbox"], data["last_updated"]) def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"): """Returns a date string according to the format string. It can handle normal date strings and shifts with respect to now.""" date = time.time() shift_re = re.compile("([-\+]{0,1})([\d]+)([dhms])") factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = shift_re.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) date = time.localtime(date + sign * factor * value) date = strftime(format_string, date) else: date = time.strptime(var, format_string) date = strftime(format_string, date) return date def get_current_time_timestamp(): """Return timestamp corresponding to the current time.""" return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) def compare_timestamps_with_tolerance(timestamp1, timestamp2, tolerance=0): """Compare two timestamps TIMESTAMP1 and TIMESTAMP2, of the form '2005-03-31 17:37:26'. Optionally receives a TOLERANCE argument (in seconds). Return -1 if TIMESTAMP1 is less than TIMESTAMP2 minus TOLERANCE, 0 if they are equal within TOLERANCE limit, and 1 if TIMESTAMP1 is greater than TIMESTAMP2 plus TOLERANCE. 
""" # remove any trailing .00 in timestamps: timestamp1 = re.sub(r'\.[0-9]+$', '', timestamp1) timestamp2 = re.sub(r'\.[0-9]+$', '', timestamp2) # first convert timestamps to Unix epoch seconds: timestamp1_seconds = calendar.timegm(time.strptime(timestamp1, "%Y-%m-%d %H:%M:%S")) timestamp2_seconds = calendar.timegm(time.strptime(timestamp2, "%Y-%m-%d %H:%M:%S")) # now compare them: if timestamp1_seconds < timestamp2_seconds - tolerance: return -1 elif timestamp1_seconds > timestamp2_seconds + tolerance: return 1 else: return 0 def get_database_last_updated_timestamp(): """Return last updated timestamp for collection-related and record-related database tables. """ database_tables_timestamps = [] database_tables_timestamps.append(get_table_update_time('bibrec')) ## In INSPIRE bibfmt is on innodb and there is not such configuration bibfmt_last_update = run_sql("SELECT max(last_updated) FROM bibfmt") if bibfmt_last_update and bibfmt_last_update[0][0]: database_tables_timestamps.append(str(bibfmt_last_update[0][0])) try: database_tables_timestamps.append(get_table_update_time('idxWORD%')) except ValueError: # There are no indexes in the database. That's OK. pass database_tables_timestamps.append(get_table_update_time('collection%')) database_tables_timestamps.append(get_table_update_time('portalbox')) database_tables_timestamps.append(get_table_update_time('field%')) database_tables_timestamps.append(get_table_update_time('format%')) database_tables_timestamps.append(get_table_update_time('rnkMETHODNAME')) database_tables_timestamps.append(get_table_update_time('accROLE_accACTION_accARGUMENT', run_on_slave=True)) return max(database_tables_timestamps) def get_cache_last_updated_timestamp(): """Return last updated cache timestamp.""" try: f = open(CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE, "r") except: return "1970-01-01 00:00:00" timestamp = f.read() f.close() return timestamp def set_cache_last_updated_timestamp(timestamp): """Set last updated cache timestamp to TIMESTAMP.""" try: f = open(CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE, "w") except: pass f.write(timestamp) f.close() return timestamp def main(): """Main that construct all the bibtask.""" task_init(authorization_action="runwebcoll", authorization_msg="WebColl Task Submission", description="""Description: webcoll updates the collection cache (record universe for a given collection plus web page elements) based on invenio.conf and DB configuration parameters. If the collection name is passed as an argument, only this collection's cache will be updated. If the recursive option is set as well, the collection's descendants will also be updated.\n""", help_specific_usage=" -c, --collection\t Update cache for the given " "collection only. [all]\n" " -r, --recursive\t Update cache for the given collection and all its\n" "\t\t\t descendants (to be used in combination with -c). [no]\n" " -q, --quick\t\t Skip webpage cache update for those collections whose\n" "\t\t\t reclist was not changed. Note: if you use this option, it is advised\n" "\t\t\t to schedule, e.g. a nightly 'webcoll --force'. [no]\n" " -f, --force\t\t Force update even if cache is up to date. [no]\n" " -p, --part\t\t Update only certain cache parts (1=reclist," " 2=webpage). [both]\n" " -l, --language\t Update pages in only certain language" " (e.g. fr,it,...). 
[all]\n", version=__revision__, specific_params=("c:rqfp:l:", [ "collection=", "recursive", "quick", "force", "part=", "language=" ]), task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter, task_submit_check_options_fnc=task_submit_check_options, task_run_fnc=task_run_core) def task_submit_elaborate_specific_parameter(key, value, opts, args): """ Given the string key it checks it's meaning, eventually using the value. Usually it fills some key in the options dict. It must return True if it has elaborated the key, False, if it doesn't know that key. eg: if key in ['-n', '--number']: self.options['number'] = value return True return False """ if key in ("-c", "--collection"): task_set_option("collection", value) elif key in ("-r", "--recursive"): task_set_option("recursive", 1) elif key in ("-f", "--force"): task_set_option("force", 1) elif key in ("-q", "--quick"): task_set_option("quick", 1) elif key in ("-p", "--part"): task_set_option("part", int(value)) elif key in ("-l", "--language"): languages = task_get_option("language", []) languages += value.split(',') for ln in languages: if ln not in CFG_SITE_LANGS: print 'ERROR: "%s" is not a recognized language code' % ln return False task_set_option("language", languages) else: return False return True def task_submit_check_options(): if task_has_option('collection'): coll = get_collection(task_get_option("collection")) if coll.id is None: print 'ERROR: Collection "%s" does not exist' % coll.name return False return True def task_run_core(): """ Reimplement to add the body of the task.""" # # ------->--->time--->------> # (-1) | ( 0) | ( 1) # | | | # [T.db] | [T.fc] | [T.db] # | | | # |<-tol|tol->| # # the above is the compare_timestamps_with_tolerance result "diagram" # [T.db] stands fore the database timestamp and [T.fc] for the file cache timestamp # ( -1, 0, 1) stand for the returned value # tol stands for the tolerance in seconds # # When a record has been added or deleted from one of the collections the T.db becomes greater that the T.fc # and when webcoll runs it is fully ran. It recalculates the reclists and nbrecs, and since it updates the # collections db table it also updates the T.db. The T.fc is set as the moment the task started running thus # slightly before the T.db (practically the time distance between the start of the task and the last call of # update_reclist). Therefore when webcoll runs again, and even if no database changes have taken place in the # meanwhile, it fully runs (because compare_timestamps_with_tolerance returns 0). This time though, and if # no databases changes have taken place, the T.db remains the same while T.fc is updated and as a result if # webcoll runs again it will not be fully ran # task_run_start_timestamp = get_current_time_timestamp() colls = [] # decide whether we need to run or not, by comparing last updated timestamps: write_message("Database timestamp is %s." % get_database_last_updated_timestamp(), verbose=3) write_message("Collection cache timestamp is %s." % get_cache_last_updated_timestamp(), verbose=3) if task_has_option("part"): write_message("Running cache update part %s only." 
% task_get_option("part"), verbose=3) if check_nbrecs_for_all_external_collections() or task_has_option("force") or \ compare_timestamps_with_tolerance(get_database_last_updated_timestamp(), get_cache_last_updated_timestamp(), CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE) >= 0: ## either forced update was requested or cache is not up to date, so recreate it: # firstly, decide which collections to do: if task_has_option("collection"): coll = get_collection(task_get_option("collection")) colls.append(coll) if task_has_option("recursive"): r_type_descendants = coll.get_descendants(type='r') colls += r_type_descendants v_type_descendants = coll.get_descendants(type='v') colls += v_type_descendants else: res = run_sql("SELECT name FROM collection ORDER BY id") for row in res: colls.append(get_collection(row[0])) # secondly, update collection reclist cache: if task_get_option('part', 1) == 1: i = 0 for coll in colls: i += 1 write_message("%s / reclist cache update" % coll.name) if str(coll.dbquery).startswith("hostedcollection:"): coll.set_nbrecs_for_external_collection() else: coll.calculate_reclist() coll.update_reclist() task_update_progress("Part 1/2: done %d/%d" % (i, len(colls))) task_sleep_now_if_required(can_stop_too=True) # thirdly, update collection webpage cache: if task_get_option("part", 2) == 2: # Updates cache only for chosen languages or for all available ones if none was chosen languages = task_get_option("language", CFG_SITE_LANGS) write_message("Cache update for the following languages: %s" % str(languages), verbose=3) i = 0 for coll in colls: i += 1 if coll.reclist_updated_since_start or task_has_option("collection") or task_get_option("force") or not task_get_option("quick"): write_message("%s / webpage cache update" % coll.name) for lang in languages: coll.update_webpage_cache(lang) else: write_message("%s / webpage cache seems not to need an update and --quick was used" % coll.name, verbose=2) task_update_progress("Part 2/2: done %d/%d" % (i, len(colls))) task_sleep_now_if_required(can_stop_too=True) # finally update the cache last updated timestamp: # (but only when all collections were updated, not when only # some of them were forced-updated as per admin's demand) if not task_has_option("collection"): set_cache_last_updated_timestamp(task_run_start_timestamp) write_message("Collection cache timestamp is set to %s." % get_cache_last_updated_timestamp(), verbose=3) else: ## cache up to date, we don't have to run write_message("Collection cache is up to date, no need to run.") ## we are done: return True ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/websession/lib/session.py b/modules/websession/lib/session.py index 1ae199a37..9f1f0bd4a 100644 --- a/modules/websession/lib/session.py +++ b/modules/websession/lib/session.py @@ -1,629 +1,629 @@ # -*- coding: utf-8 -*- # This file is part of Invenio. -# Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2015 CERN. +# Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2015, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Session management adapted from mod_python Session class. Just use L{get_session} to obtain a session object (with a dictionary interface, which will let you store permanent information). """ from invenio.webinterface_handler_wsgi_utils import Cookie, get_cookie import cPickle import time import random import re import sys import os if sys.hexversion < 0x2060000: from md5 import md5 else: from hashlib import md5 from invenio.dateutils import convert_datestruct_to_datetext from invenio.dbquery import blob_to_string from invenio.config import (CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER, CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT, CFG_SITE_URL, CFG_SITE_SECURE_URL, CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS, CFG_WEBSEARCH_PREV_NEXT_HIT_FOR_GUESTS, CFG_WEBSESSION_STORAGE) from invenio.websession_config import (CFG_WEBSESSION_COOKIE_NAME, CFG_WEBSESSION_ONE_DAY, CFG_WEBSESSION_CLEANUP_CHANCE) from invenio.dbquery import run_sql from invenio.redisutils import get_redis CFG_FULL_HTTPS = CFG_SITE_URL.lower().startswith("https://") if CFG_WEBSEARCH_PREV_NEXT_HIT_FOR_GUESTS: _CFG_SESSION_NON_USEFUL_KEYS = ('uid', 'user_info') else: _CFG_SESSION_NON_USEFUL_KEYS = ('uid', 'user_info', 'websearch-last-query', 'websearch-last-query-hits') def get_session(req, sid=None): """ Obtain a session. If the session has already been created for the current request, returns the already existing session. @param req: the mod_python request object. @type req: mod_python request object @param sid: the session identifier of an already existing session. @type sid: 32 hexadecimal string @return: the session. @rtype: InvenioSession @raise ValueError: if C{sid} is provided and it doesn't correspond to a valid session. """ if sid is not None: req._session = InvenioSession(req, sid) return req._session if not hasattr(req, '_session'): req._session = InvenioSession(req, sid) return req._session class InvenioSessionBase(dict): """ This class implements a Session handling based on MySQL. @param req: the mod_python request object. @type req: mod_python request object @param sid: the session identifier if already known @type sid: 32 hexadecimal string @ivar _remember_me: if the session cookie should last one day or until the browser is closed. @type _remember_me: bool @note: The code is heavily based on ModPython 3.3.1 DBMSession implementation. @note: This class implements IP verification to prevent basic cookie stealing. @raise ValueError: if C{sid} is provided and correspond to a broken session. """ def __init__(self, req, sid=None): self._remember_me = False self._req, self._sid, self._secret = req, sid, None self._new = 1 self._created = 0 self._accessed = 0 self._timeout = 0 self._invalid = 0 self._dirty = False self._http_ip = None self._https_ip = None self.__need_https = False self._cleanup_function = None dict.__init__(self) if not self._sid: # check to see if cookie exists cookie = get_cookie(req, CFG_WEBSESSION_COOKIE_NAME) if cookie: self._sid = cookie.value else: stub_cookie = get_cookie(req, CFG_WEBSESSION_COOKIE_NAME + 'stub') self.__need_https = stub_cookie and stub_cookie.value == 'HTTPS' if self._sid: if not _check_sid(self._sid): if sid: # Supplied explicitly by user of the class, # raise an exception and make the user code # deal with it. 
raise ValueError("Invalid Session ID: sid=%s" % sid) else: # Derived from the cookie sent by browser, # wipe it out so it gets replaced with a # correct value. self._sid = None if self._sid: # attempt to load ourselves if self.load(): self._new = 0 if self._new: # make a new session self._sid = _new_sid(self._req) remote_ip = self._req.remote_ip if self._req.is_https(): self._https_ip = remote_ip else: self._http_ip = remote_ip self._created = time.time() self._timeout = CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT * \ CFG_WEBSESSION_ONE_DAY self._accessed = time.time() # need cleanup? if random.randint(1, CFG_WEBSESSION_CLEANUP_CHANCE) == 1: self.cleanup() if self._new and (not self.__need_https or self._req.is_https()): ## We want to issue cookies only in case this is a new session ## and there is not already a session cookie that is available ## only over HTTPS for cookie in self.make_cookies(): self._req.set_cookie(cookie) def get_dirty(self): """ Is this session dirty? """ return self._dirty def set_dirty(self, dummy=True): """ Flag this session as dirty. It takes a parameter, just in order to be used within a property """ self._dirty = True dirty = property(get_dirty, set_dirty) def __setitem__(self, key, value): if self.get(key) != value: dict.__setitem__(self, key, value) self._dirty = True def __delitem__(self, key): if key in self: dict.__delitem__(self, key) self._dirty = True def set_remember_me(self, remember_me=True): """ Set/Unset the L{_remember_me} flag. @param remember_me: True if the session cookie should last one day or until the browser is closed. @type remember_me: bool """ self._remember_me = remember_me if remember_me: self.set_timeout(CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER * CFG_WEBSESSION_ONE_DAY) else: self.set_timeout(CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT * CFG_WEBSESSION_ONE_DAY) for cookie in self.make_cookies(): self._req.set_cookie(cookie) def load(self): """ Load the session from the database. @return: 1 in case of success, 0 otherwise. @rtype: integer """ session_dict = None invalid = False res = self.load_from_storage(self._sid) if res: session_dict = cPickle.loads(blob_to_string(res)) remote_ip = self._req.remote_ip if self._req.is_https(): if session_dict['_https_ip'] is not None: if ':' in remote_ip: ## IPV6 address, we don't skip bits if session_dict['_https_ip'] != remote_ip: invalid = True else: if _mkip(session_dict['_https_ip']) >> \ CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS != \ _mkip(remote_ip) >> \ CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS: invalid = True else: session_dict['_https_ip'] = remote_ip else: if session_dict['_http_ip'] is not None: if ':' in remote_ip: ## IPV6 address, we don't skip bits if session_dict['_http_ip'] != remote_ip: invalid = True else: if _mkip(session_dict['_http_ip']) >> \ CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS != \ _mkip(remote_ip) >> \ CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS: invalid = True else: session_dict['_http_ip'] = remote_ip if session_dict is None: return 0 if invalid: return 0 if (time.time() - session_dict["_accessed"]) > \ session_dict["_timeout"]: return 0 self._created = session_dict["_created"] self._accessed = session_dict["_accessed"] self._timeout = session_dict["_timeout"] self._remember_me = session_dict["_remember_me"] self.update(session_dict["_data"]) return 1 def is_useful(self): """ Return True if the session contains some key considered useful (i.e. 
that deserve being preserved) """ for key in self: if key not in _CFG_SESSION_NON_USEFUL_KEYS: return True return False def save(self): """ Save the session to the database. """ uid = self.get('uid', -1) if (not self.__need_https or self._req.is_https()) and not self._invalid and self._sid and self._dirty and (uid > 0 or self.is_useful()): ## We store something only for real users or useful sessions. session_dict = {"_data" : self.copy(), "_created" : self._created, "_accessed": self._accessed, "_timeout" : self._timeout, "_http_ip" : self._http_ip, "_https_ip" : self._https_ip, "_remember_me" : self._remember_me } session_object = cPickle.dumps(session_dict, -1) self.save_in_storage(self._sid, session_object, self._timeout, uid) for cookie in self.make_cookies(): self._req.set_cookie(cookie) ## No more dirty :-) self._dirty = False def delete(self): """ Delete the session. """ self.delete_from_storage(self._sid) self.clear() def invalidate(self): """ Declare the session as invalid. """ cookies = self.make_cookies() for cookie in cookies: cookie.expires = 0 self._req.set_cookie(cookie) self.delete() self._invalid = 1 if hasattr(self._req, '_session'): delattr(self._req, '_session') def make_cookies(self): """ Create the necessary cookies to implement secure session handling (possibly over HTTPS). @return: a list of cookies. """ cookies = [] uid = self.get('uid', -1) if uid > 0 and CFG_SITE_SECURE_URL.startswith("https://"): stub_cookie = Cookie(CFG_WEBSESSION_COOKIE_NAME + 'stub', 'HTTPS', HttpOnly=True) else: stub_cookie = Cookie(CFG_WEBSESSION_COOKIE_NAME + 'stub', 'NO', HttpOnly=True) cookies.append(stub_cookie) if self._req.is_https() or not CFG_SITE_SECURE_URL.startswith("https://") or uid <= 0: cookie = Cookie(CFG_WEBSESSION_COOKIE_NAME, self._sid, HttpOnly=True) if CFG_SITE_SECURE_URL.startswith("https://") and uid > 0: cookie.secure = True cookies.append(cookie) for cookie in cookies: cookie.path = '/' if self._remember_me: cookie.expires = time.time() + self._timeout return cookies def initial_http_ip(self): """ @return: the initial ip addressed for the HTTP protocol for which this session was issued. @rtype: string @note: it returns None if this session has always been used through HTTPS requests. """ return self._http_ip def initial_https_ip(self): """ @return: the initial ip addressed for the HTTPS protocol for which this session was issued. @rtype: string @note: it returns None if this session has always been used through HTTP requests. """ return self._https_ip def is_new(self): """ @return: True if the session has just been created. @rtype: bool """ return not not self._new def sid(self): """ @return: the session identifier. @rtype: 32 hexadecimal string """ return self._sid def created(self): """ @return: the UNIX timestamp for when the session has been created. @rtype: double """ return self._created def last_accessed(self): """ @return: the UNIX timestamp for when the session has been last accessed. @rtype: double """ return self._accessed def timeout(self): """ @return: the number of seconds from the last accessed timestamp, after which the session is invalid. @rtype: double """ return self._timeout def set_timeout(self, secs): """ Set the number of seconds from the last accessed timestamp, after which the session is invalid. @param secs: the number of seconds. @type secs: double """ self._timeout = secs def cleanup(self): """ Perform the database session cleanup. 
""" if self._cleanup_function: self._req.register_cleanup(self._cleanup_function) self._req.log_error("InvenioSession: registered database cleanup.") ## NOTE: Let's disable __del__ to avoid garbage collection not to ## be able to delete circular references involving the session ## We can .save() anyway in good points, such as at the end of ## of the application request #def __del__(self): #self.save() def get_need_https(self): return self.__need_https ## This property will be True if the connection need to be set to HTTPS ## in order for the session to be successfully read. This can actually ## be checked by not having a cookie, but just having the stub_cookie. ## The default cookie is only sent via HTTPS, while the stub_cookie ## is also sent via HTTP and contains the uid, of the user. So if there ## is actually a stub cookie and its value is different than -1 this ## property will be True, meaning the server should redirect the client ## to an HTTPS connection if she really wants to access authenticated ## resources. need_https = property(get_need_https) def _init_rnd(): """ Initialize random number generators. This is key in multithreaded env, see Python docs for random. @return: the generators. @rtype: list of generators """ # query max number of threads gennum = 10 # make generators # this bit is from Python lib reference random_generator = random.Random(time.time()) result = [random_generator] for dummy in range(gennum - 1): laststate = random_generator.getstate() random_generator = random.Random() random_generator.setstate(laststate) random_generator.jumpahead(1000000) result.append(random_generator) return result _RANDOM_GENERATORS = _init_rnd() _RANDOM_ITERATOR = iter(_RANDOM_GENERATORS) def _get_generator(): """ get rnd_iter.next(), or start over if we reached the end of it @return: the next random number. @rtype: double """ global _RANDOM_ITERATOR try: return _RANDOM_ITERATOR.next() except StopIteration: # the small potential for two threads doing this # seems does not warrant use of a lock _RANDOM_ITERATOR = iter(_RANDOM_GENERATORS) return _RANDOM_ITERATOR.next() _RE_VALIDATE_SID = re.compile('[0-9a-f]{32}$') def _check_sid(sid): """ Check the validity of the session identifier. The sid must be 32 characters long, and consisting of the characters 0-9 and a-f. The sid may be passed in a cookie from the client and as such should not be trusted. This is particularly important in FileSession, where the session filename is derived from the sid. A sid containing '/' or '.' characters could result in a directory traversal attack @param sid: the session identifier. @type sid: string @return: True if the session identifier is valid. @rtype: bool """ return not not _RE_VALIDATE_SID.match(sid) def _new_sid(req): """ Make a number based on current time, pid, remote ip and two random ints, then hash with md5. This should be fairly unique and very difficult to guess. @param req: the mod_python request object. @type req: mod_python request object. @return: the session identifier. @rtype: 32 hexadecimal string @warning: The current implementation of _new_sid returns an md5 hexdigest string. To avoid a possible directory traversal attack in FileSession the sid is validated using the _check_sid() method and the compiled regex validate_sid_re. The sid will be accepted only if len(sid) == 32 and it only contains the characters 0-9 and a-f. 
If you change this implementation of _new_sid, make sure to also change the validation scheme, as well as the test_Session_illegal_sid() unit test in test/test.py. """ the_time = long(time.time()*10000) pid = os.getpid() random_generator = _get_generator() rnd1 = random_generator.randint(0, 999999999) rnd2 = random_generator.randint(0, 999999999) remote_ip = req.remote_ip return md5("%d%d%d%d%s" % ( the_time, pid, rnd1, rnd2, remote_ip) ).hexdigest() def _mkip(ip): """ Compute a numerical value for a dotted IP """ num = 0L for i in ip.split('.'): num = (num << 8) + int(i) return num class InvenioSessionMySQL(InvenioSessionBase): def __init__(self, req, sid=None): def cb_session_cleanup(data=None): """ Session cleanup procedure which to be executed at the end of the request handling. """ run_sql("""DELETE LOW_PRIORITY FROM session WHERE session_expiry <= UTC_TIMESTAMP()""") self.cleanup_function = cb_session_cleanup super(InvenioSessionMySQL, self).__init__(req, sid) def load_from_storage(self, sid): ret = run_sql("""SELECT session_object FROM session WHERE session_key = %s""", [sid]) if ret: return ret[0][0] def delete_from_storage(self, sid): return run_sql("""DELETE LOW_PRIORITY FROM session WHERE session_key=%s""", [sid]) def save_in_storage(self, sid, session_object, timeout, uid): session_key = sid session_expiry = time.time() + timeout + CFG_WEBSESSION_ONE_DAY session_expiry = convert_datestruct_to_datetext(time.gmtime(session_expiry)) run_sql("""INSERT INTO session( session_key, session_expiry, session_object, uid - ) VALUES (%s, %s, %s, %s) + ) VALUES (%s, %s, _binary %s, %s) ON DUPLICATE KEY UPDATE session_expiry=%s, - session_object=%s, + session_object=_binary %s, uid=%s """, (session_key, session_expiry, session_object, uid, session_expiry, session_object, uid)) class InvenioSessionRedis(InvenioSessionBase): def generate_key(self, sid): return 'session_%s' % sid def load_from_storage(self, sid): return get_redis().get(self.generate_key(sid)) def delete_from_storage(self, sid): return get_redis().delete(self.generate_key(sid)) def save_in_storage(self, sid, session_object, timeout, uid): # pylint: disable=W0613 return get_redis().setex(self.generate_key(sid), session_object, timeout) if CFG_WEBSESSION_STORAGE == 'mysql': InvenioSession = InvenioSessionMySQL elif CFG_WEBSESSION_STORAGE == 'redis': InvenioSession = InvenioSessionRedis diff --git a/requirements.txt b/requirements.txt index 160ec6b0b..77a74e64f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,31 +1,31 @@ # Invenio requirements. MySQL-python==1.2.5 rdflib==2.4.2 reportlab==2.5 python-dateutil<=1.9999 python-magic==0.4.2 -http://www.reportlab.com/ftp/pyRXP-1.16-daily-unix.tar.gz +http://invenio-software.org/download/python/pyRXP-1.16-daily-unix.tar.gz numpy==1.7.0 lxml==3.1.2 mechanize==0.2.5 python-Levenshtein==0.10.2 PyStemmer==1.3.0 -https://py-editdist.googlecode.com/files/py-editdist-0.3.tar.gz +http://invenio-software.org/download/python/py-editdist-0.3.tar.gz feedparser==5.1.3 BeautifulSoup==3.2.1 beautifulsoup4==4.1.3 python-twitter==2.0 msgpack-python==0.3.0 pyparsing==1.5.6 requests==2.2.0 PyPDF2==1.19 rauth==0.6.2 unidecode==0.04.14 python-openid==2.2.5 qrcode==4.0.4 pillow==2.3.0 jinja2==2.7.2 redis==2.9.0 nydus==0.10.6 Cerberus==0.5 matplotlib==1.0.1 diff --git a/scripts/create-instance.sh b/scripts/create-instance.sh index 39257fd04..d52334771 100755 --- a/scripts/create-instance.sh +++ b/scripts/create-instance.sh @@ -1,264 +1,264 @@ #!/usr/bin/env bash # # This file is part of Invenio. 
# Copyright (C) 2016 CERN. # # Invenio is free software; you can redistribute it # and/or modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the # Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, # MA 02111-1307, USA. # # In applying this license, CERN does not # waive the privileges and immunities granted to it by virtue of its status # as an Intergovernmental Organization or submit itself to any jurisdiction. # quit on errors: set -o errexit # check environment variables: if [ "${INVENIO_MYSQL_HOST}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_MYSQL_HOST before runnning this script." echo "[ERROR] Example: export INVENIO_MYSQL_HOST=192.168.50.11" exit 1 fi if [ "${INVENIO_MYSQL_DBNAME}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_MYSQL_DBNAME before runnning this script." echo "[ERROR] Example: INVENIO_MYSQL_DBNAME=invenio1" exit 1 fi if [ "${INVENIO_MYSQL_DBUSER}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_MYSQL_DBUSER before runnning this script." echo "[ERROR] Example: INVENIO_MYSQL_DBUSER=invenio1" exit 1 fi if [ "${INVENIO_MYSQL_DBPASS}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_MYSQL_DBPASS before runnning this script." echo "[ERROR] Example: INVENIO_MYSQL_DBPASS=dbpass123" exit 1 fi if [ "${INVENIO_WEB_HOST}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_WEB_HOST before runnning this script." echo "[ERROR] Example: export INVENIO_WEB_HOST=192.168.50.10" exit 1 fi if [ "${INVENIO_WEB_DSTDIR}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_WEB_DSTDIR before runnning this script." echo "[ERROR] Example: export INVENIO_WEB_DSTDIR=/opt/invenio" exit 1 fi if [ "${INVENIO_WEB_USER}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_WEB_USER before runnning this script." echo "[ERROR] Example: export INVENIO_WEB_USER=www-data" exit 1 fi # check optional environment variables: INVENIO_WEB_SMTP_PORT=${INVENIO_WEB_SMTP_PORT:=25} # quit on unbound symbols: set -o nounset # runs inside virtual environment? VIRTUAL_ENV=${VIRTUAL_ENV:=} # runs as root or needs sudo? if [[ "$EUID" -ne 0 ]]; then sudo='sudo' else sudo='' fi create_apache_vhost_ubuntu_precise () { sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ssl-cert sudo mkdir -p /etc/apache2/ssl if [ ! -e /etc/apache2/ssl/apache.pem ]; then sudo DEBIAN_FRONTEND=noninteractive /usr/sbin/make-ssl-cert \ /usr/share/ssl-cert/ssleay.cnf /etc/apache2/ssl/apache.pem fi if [ ! -L /etc/apache2/sites-available/invenio.conf ]; then sudo ln -fs "${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf" \ /etc/apache2/sites-available/invenio.conf fi if [ ! -e "${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf" ]; then # create them empty for the time being so that apache would start sudo mkdir -p "${INVENIO_WEB_DSTDIR}/etc/apache/" sudo touch "${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf" sudo chown -R "${INVENIO_WEB_USER}.${INVENIO_WEB_USER}" "${INVENIO_WEB_DSTDIR}" fi if [ ! 
-L /etc/apache2/sites-available/invenio-ssl.conf ]; then sudo ln -fs "${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf" \ /etc/apache2/sites-available/invenio-ssl.conf fi if [ ! -e "${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf" ]; then # create them empty for the time being so that apache would start sudo mkdir -p "${INVENIO_WEB_DSTDIR}/etc/apache/" sudo touch "${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf" sudo chown -R "${INVENIO_WEB_USER}.${INVENIO_WEB_USER}" "${INVENIO_WEB_DSTDIR}" fi if [ -e /etc/apache2/sites-available/default-ssl ]; then sudo /usr/sbin/a2dissite "*default*" fi sudo /usr/sbin/a2ensite "invenio*" sudo /usr/sbin/a2enmod ssl sudo /usr/sbin/a2enmod version || echo "[WARNING] Ignoring 'a2enmod version' command; hoping IfVersion is built-in." sudo /usr/sbin/a2enmod xsendfile sudo /etc/init.d/apache2 restart } create_apache_vhost_centos6 () { if ! grep -q "Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf" /etc/httpd/conf/httpd.conf; then echo "Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf" | sudo tee -a /etc/httpd/conf/httpd.conf fi if ! grep -q "Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf" /etc/httpd/conf/httpd.conf; then echo "Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf" | sudo tee -a /etc/httpd/conf/httpd.conf fi if ! grep -q "TraceEnable off" /etc/httpd/conf/httpd.conf; then echo "TraceEnable off" | sudo tee -a /etc/httpd/conf/httpd.conf fi if ! grep -q "SSLProtocol all -SSLv2" /etc/httpd/conf/httpd.conf; then echo "SSLProtocol all -SSLv2" | sudo tee -a /etc/httpd/conf/httpd.conf fi sudo sed -i 's,^Alias /error/,#Alias /error/,g' /etc/httpd/conf/httpd.conf } create_symlinks () { $sudo mkdir -p "${INVENIO_WEB_DSTDIR}" $sudo chown "${INVENIO_WEB_USER}.${INVENIO_WEB_USER}" "${INVENIO_WEB_DSTDIR}" $sudo -u "${INVENIO_WEB_USER}" mkdir -p "${INVENIO_WEB_DSTDIR}/lib/python/invenio" for pythonversion in python2.4 python2.6 python2.7; do for libversion in lib lib64 local/lib local/lib64; do for packageversion in site-packages dist-packages; do if [ -d "/usr/$libversion/$pythonversion/$packageversion/" ] && [ ! 
-L "/usr/$libversion/$pythonversion/$packageversion/invenio" ]; then $sudo ln -s "${INVENIO_WEB_DSTDIR}/lib/python/invenio" "/usr/$libversion/$pythonversion/$packageversion/invenio" fi done done done } install_sources () { cd "${INVENIO_SRCDIR}" rm -rf autom4te.cache/ aclocal automake -a autoconf ./configure --prefix="${INVENIO_WEB_DSTDIR}" make clean -s make -s sudo -u "${INVENIO_WEB_USER}" make -s install - #sudo -u "${INVENIO_WEB_USER}" make -s install-jquery-plugins + sudo -u "${INVENIO_WEB_USER}" make -s install-jquery-plugins sudo -u "${INVENIO_WEB_USER}" make -s install-mathjax-plugin sudo -u "${INVENIO_WEB_USER}" make -s install-ckeditor-plugin sudo -u "${INVENIO_WEB_USER}" make -s install-pdfa-helper-files sudo -u "${INVENIO_WEB_USER}" make -s install-mediaelement } create_openoffice_tmp_space () { sudo mkdir -p "${INVENIO_WEB_DSTDIR}/var/tmp/ooffice-tmp-files" sudo chown -R nobody "${INVENIO_WEB_DSTDIR}/var/tmp/ooffice-tmp-files" sudo chmod -R 755 "${INVENIO_WEB_DSTDIR}/var/tmp/ooffice-tmp-files" } configure_instance () { # create invenio-local.conf echo "[Invenio] CFG_SITE_URL = http://${INVENIO_WEB_HOST} CFG_SITE_SECURE_URL = https://${INVENIO_WEB_HOST} CFG_DATABASE_HOST = ${INVENIO_MYSQL_HOST} CFG_DATABASE_NAME = ${INVENIO_MYSQL_DBNAME} CFG_DATABASE_USER = ${INVENIO_MYSQL_DBUSER} CFG_DATABASE_PASS = ${INVENIO_MYSQL_DBPASS} CFG_SITE_ADMIN_EMAIL = ${INVENIO_ADMIN_EMAIL} CFG_SITE_SUPPORT_EMAIL = ${INVENIO_ADMIN_EMAIL} CFG_WEBALERT_ALERT_ENGINE_EMAIL = ${INVENIO_ADMIN_EMAIL} CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL = ${INVENIO_ADMIN_EMAIL} CFG_WEBCOMMENT_DEFAULT_MODERATOR = ${INVENIO_ADMIN_EMAIL} CFG_BIBAUTHORID_AUTHOR_TICKET_ADMIN_EMAIL = ${INVENIO_ADMIN_EMAIL} CFG_BIBCATALOG_SYSTEM_EMAIL_ADDRESS = ${INVENIO_ADMIN_EMAIL} CFG_BIBSCHED_PROCESS_USER = ${INVENIO_WEB_USER} CFG_MISCUTIL_SMTP_PORT = ${INVENIO_WEB_SMTP_PORT} " | \ sudo -u "${INVENIO_WEB_USER}" tee "${INVENIO_WEB_DSTDIR}/etc/invenio-local.conf" # update instance with this information: sudo -u "${INVENIO_WEB_USER}" "${INVENIO_WEB_DSTDIR}/bin/inveniocfg" --update-all } create_tables () { sudo -u "${INVENIO_WEB_USER}" "${INVENIO_WEB_DSTDIR}/bin/inveniocfg" --create-tables --yes-i-know } create_apache_configuration () { sudo -u "${INVENIO_WEB_USER}" VIRTUAL_ENV="${VIRTUAL_ENV}" "${INVENIO_WEB_DSTDIR}/bin/inveniocfg" --create-apache-conf } restart_apache_ubuntu_precise () { $sudo /etc/init.d/apache2 restart } restart_apache_centos6 () { $sudo /etc/init.d/httpd restart } main () { # detect OS distribution and release version: if hash lsb_release 2> /dev/null; then os_distribution=$(lsb_release -i | cut -f 2) os_release=$(lsb_release -r | cut -f 2 | grep -oE '[0-9]+\.' | cut -d. -f1 | head -1) elif [ -e /etc/redhat-release ]; then os_distribution=$(cut -d ' ' -f 1 /etc/redhat-release) os_release=$(grep -oE '[0-9]+\.' /etc/redhat-release | cut -d. -f1 | head -1) else os_distribution="UNDETECTED" os_release="UNDETECTED" fi # call appropriate provisioning functions: if [ "$os_distribution" = "Ubuntu" ]; then if [ "$os_release" = "12" ]; then create_apache_vhost_ubuntu_precise create_symlinks install_sources create_openoffice_tmp_space configure_instance create_tables create_apache_configuration restart_apache_ubuntu_precise else echo "[ERROR] Sorry, unsupported release ${os_release}." 
            exit 1
        fi
    elif [ "$os_distribution" = "CentOS" ]; then
        if [ "$os_release" = "6" ]; then
            create_apache_vhost_centos6
            create_symlinks
            install_sources
            create_openoffice_tmp_space
            configure_instance
            create_tables
            create_apache_configuration
            restart_apache_centos6
        else
            echo "[ERROR] Sorry, unsupported release ${os_release}."
            exit 1
        fi
    else
        echo "[ERROR] Sorry, unsupported distribution ${os_distribution}."
        exit 1
    fi
}

main
diff --git a/scripts/drop-instance.sh b/scripts/drop-instance.sh
index ae23a0443..8f0a81369 100755
--- a/scripts/drop-instance.sh
+++ b/scripts/drop-instance.sh
@@ -1,184 +1,185 @@
#!/usr/bin/env bash
#
# This file is part of Invenio.
# Copyright (C) 2015, 2016 CERN.
#
# Invenio is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

# quit on errors:
set -o errexit

# check environment variables:
if [ "${INVENIO_MYSQL_HOST}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_MYSQL_HOST before running this script."
    echo "[ERROR] Example: export INVENIO_MYSQL_HOST=192.168.50.11"
    exit 1
fi
if [ "${INVENIO_MYSQL_DBNAME}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_MYSQL_DBNAME before running this script."
    echo "[ERROR] Example: export INVENIO_MYSQL_DBNAME=invenio1"
    exit 1
fi
if [ "${INVENIO_MYSQL_DBUSER}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_MYSQL_DBUSER before running this script."
    echo "[ERROR] Example: export INVENIO_MYSQL_DBUSER=invenio1"
    exit 1
fi
if [ "${INVENIO_MYSQL_DBPASS}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_MYSQL_DBPASS before running this script."
    echo "[ERROR] Example: export INVENIO_MYSQL_DBPASS=dbpass123"
    exit 1
fi
if [ "${INVENIO_WEB_HOST}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_WEB_HOST before running this script."
    echo "[ERROR] Example: export INVENIO_WEB_HOST=192.168.50.10"
    exit 1
fi
if [ "${INVENIO_WEB_DSTDIR}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_WEB_DSTDIR before running this script."
    echo "[ERROR] Example: export INVENIO_WEB_DSTDIR=/opt/invenio"
    exit 1
fi
if [ "${INVENIO_WEB_USER}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_WEB_USER before running this script."
    echo "[ERROR] Example: export INVENIO_WEB_USER=www-data"
    exit 1
fi

# quit on unbound symbols:
set -o nounset

# runs as root or needs sudo?
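# (Clarifying note on the idiom used below: when the script does not already
# run as root, the $sudo variable is set to "sudo" and prefixes every
# privileged command; when it runs as root, $sudo expands to nothing.)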
if [[ "$EUID" -ne 0 ]]; then sudo='sudo' else sudo='' fi start_apache_ubuntu_precise () { $sudo /etc/init.d/apache2 start } stop_apache_ubuntu_precise () { $sudo /etc/init.d/apache2 stop } start_apache_centos6 () { $sudo /etc/init.d/httpd start } stop_apache_centos6 () { $sudo /etc/init.d/httpd stop } drop_apache_vhost_ubuntu_precise () { stop_apache_ubuntu_precise if [ -e /etc/apache2/sites-available/default-ssl ]; then $sudo /usr/sbin/a2ensite "*default*" fi if [ -L /etc/apache2/sites-enabled/invenio.conf ]; then $sudo /usr/sbin/a2dissite "invenio*" fi start_apache_ubuntu_precise } drop_apache_vhost_centos6 () { stop_apache_centos6 if grep -q "Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf" /etc/httpd/conf/httpd.conf; then sudo sed -i "s,^Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf,#Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf,g" /etc/httpd/conf/httpd.conf fi if grep -q "Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf" /etc/httpd/conf/httpd.conf; then sudo sed -i "s,^Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf,#Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf,g" /etc/httpd/conf/httpd.conf fi sudo sed -i 's,^#Alias /error/,Alias /error/,g' /etc/httpd/conf/httpd.conf start_apache_centos6 } drop_symlinks () { for pythonversion in python2.4 python2.6 python2.7; do for libversion in lib lib64 local/lib local/lib64; do for packageversion in site-packages dist-packages; do if [ -d /usr/$libversion/$pythonversion/$packageversion/ ] && [ ! -L /usr/$libversion/$pythonversion/$packageversion/invenio ]; then $sudo rm /usr/$libversion/$pythonversion/$packageversion/invenio fi done done done } drop_instance_folder () { $sudo rm -rf "${INVENIO_WEB_DSTDIR}/var/tmp/ooffice-tmp-files" - $sudo -u "${INVENIO_WEB_USER}" rm -rf "${INVENIO_WEB_DSTDIR}/*" + # shellcheck disable=SC2086 + $sudo -u "${INVENIO_WEB_USER}" rm -rf ${INVENIO_WEB_DSTDIR}/* } drop_instance_tables () { if [ -e "${INVENIO_WEB_DSTDIR}/bin/inveniocfg" ]; then $sudo -u "${INVENIO_WEB_USER}" "${INVENIO_WEB_DSTDIR}/bin/inveniocfg" --drop-tables --yes-i-know fi } main () { # detect OS distribution and release version: if hash lsb_release 2> /dev/null; then os_distribution=$(lsb_release -i | cut -f 2) os_release=$(lsb_release -r | cut -f 2 | grep -oE '[0-9]+\.' | cut -d. -f1 | head -1) elif [ -e /etc/redhat-release ]; then os_distribution=$(cut -d ' ' -f 1 /etc/redhat-release) os_release=$(grep -oE '[0-9]+\.' /etc/redhat-release | cut -d. -f1 | head -1) else os_distribution="UNDETECTED" os_release="UNDETECTED" fi # call appropriate provisioning functions: if [ "$os_distribution" = "Ubuntu" ]; then if [ "$os_release" = "12" ]; then stop_apache_ubuntu_precise drop_instance_tables start_apache_ubuntu_precise drop_apache_vhost_ubuntu_precise drop_instance_folder drop_symlinks else echo "[ERROR] Sorry, unsupported release ${os_release}." exit 1 fi elif [ "$os_distribution" = "CentOS" ]; then if [ "$os_release" = "6" ]; then stop_apache_centos6 drop_instance_tables start_apache_centos6 drop_apache_vhost_centos6 drop_instance_folder drop_symlinks else echo "[ERROR] Sorry, unsupported release ${os_release}." exit 1 fi else echo "[ERROR] Sorry, unsupported distribution ${os_distribution}." exit 1 fi } main